فهرست منبع

Got Unicode character deserialization working.

Matthew Carr 3 سال پیش
والد
کامیت
6b5040b060
2فایلهای تغییر یافته به همراه98 افزوده شده و 8 حذف شده
  1. 95 7
      crates/node/src/serde_blocktree/de.rs
  2. 3 1
      crates/node/src/serde_blocktree/error.rs

+ 95 - 7
crates/node/src/serde_blocktree/de.rs

@@ -34,6 +34,61 @@ fn try_from<TSource, TDest: TryFrom<TSource>>(value: TSource) -> Result<TDest> {
     Ok(cast)
 }
 
+/// Returns the number of bytes needed to store the unicode character in UTF-8 whose
+/// encoding begins with the given byte. If the given byte is not valid as the first byte
+/// for any character, then a failed result with Error::InvalidUtf8Char is returned.
+fn num_bytes_for_char(first_byte: u8) -> Result<usize> {
+    if first_byte & 128u8 == 0  {
+        return Ok(1);
+    }
+    for bit in 2..5 {
+        let mask = 128u8 >> bit;
+        if mask & first_byte == 0 {
+            return Ok(bit);
+        }
+    }
+    Err(Error::InvalidUtf8Char)
+}
+
+/// Returns the unicode code point of the character that is encoded in UTF-8 in the given buffer
+/// as a u32. If the given buffer is not of a valid length then a failed result is returned.
+/// No other verification is performed.
+fn u32_from_utf8(buf: &[u8]) -> Result<u32> {
+    const MASK: u8 = 0b0011_1111;
+
+    fn mask(byte: u8, mask: u8) -> u32 {
+        (byte & mask) as u32
+    }
+
+    fn two_bytes(buf: &[u8]) -> u32 {
+       mask(buf[0], 0b0001_1111) << 6 | mask(buf[1], MASK)
+    }
+
+    fn three_bytes(buf: &[u8]) -> u32 {
+        mask(buf[0], 0b0000_1111) << 12 | mask(buf[1], MASK) << 6 | mask(buf[2], MASK)
+    }
+
+    fn four_bytes(buf: &[u8]) -> u32 {
+        mask(buf[0], 0b0000_0111) << 18 | mask(buf[1], MASK) << 12 | mask(buf[2], MASK) << 6
+            | mask(buf[3], MASK)
+    }
+
+    let code_point = match buf.len() {
+        1 => Some(buf[0] as u32),
+        2 => Some(two_bytes(buf)),
+        3 => Some(three_bytes(buf)),
+        4 => Some(four_bytes(buf)),
+        _ => None
+    };
+    code_point.ok_or(Error::InvalidUtf8Char)
+}
+
+fn char_from_utf8(buf: &[u8]) -> Result<char> {
+    let result = u32_from_utf8(buf);
+    let option = char::from_u32(result?);
+    option.ok_or(Error::InvalidUtf8Char)
+}
+
 impl<'de, T: Read + ?Sized> Deserializer<'de, T> {
     pub fn new(input: &'de mut T) -> Self {
         Deserializer { input: input }
@@ -170,10 +225,13 @@ impl<'de, 'a, T: Read> de::Deserializer<'de> for &'a mut Deserializer<'de, T> {
     }
 
     fn deserialize_char<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
-        // TODO: UTF-8 is a multi-byte encoding. I need to read the correct number of bytes here.
         let byte = self.read_u8()?;
-        let value = try_from(byte)?;
-        visitor.visit_char(value)
+        let buf_len = num_bytes_for_char(byte);
+        let mut buf = vec![0; buf_len?];
+        buf[0] = byte;
+        self.read_exact(&mut buf[1..])?;
+        let result = char_from_utf8(&buf);
+        visitor.visit_char(result?)
     }
 
     fn deserialize_str<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
@@ -340,7 +398,12 @@ mod test {
     #[allow(unused_imports)]
     use serde::Deserialize;
     #[allow(unused_imports)]
-    use super::{from_vec, Result, Deserializer};
+    use super::{
+        from_vec,
+        num_bytes_for_char,
+        Result,
+        Deserializer
+    };
     
     #[test]
     fn new() -> Result<()> {
@@ -351,6 +414,24 @@ mod test {
         Ok(())
     }
 
+    #[test]
+    fn test_num_bytes_for_char() -> Result<()> {
+        fn test_case(c: char) -> Result<()> {
+            let len = c.len_utf8();
+            let mut buf: Vec<u8> = vec![0; len];
+            c.encode_utf8(buf.as_mut_slice());
+            let result = num_bytes_for_char(buf[0]);
+            assert_eq!(len, result?);
+            Ok(())
+        }
+
+        test_case('$')?;
+        test_case('£')?;
+        test_case('€')?;
+        test_case('😑')?;
+        Ok(())
+    }
+
     #[test]
     fn deserialize_struct() -> Result<()> {
         #[derive(Debug, PartialEq, Deserialize)]
@@ -472,9 +553,16 @@ mod test {
 
     #[test]
     fn deserialize_char() -> Result<()> {
-        let vec: Vec<u8> = vec![0xF0, 0x9F, 0x98, 0x8E];
-        let result = from_vec(&vec);
-        assert_eq!('😎', result?);
+        fn test_case(c: char, vec: Vec<u8>) -> Result<()> {
+            let result = from_vec(&vec);
+            assert_eq!(c, result?);
+            Ok(())
+        }
+
+        test_case('*', vec![0x2A])?;
+        test_case('£', vec![0xC2, 0xA3])?;
+        test_case('€', vec![0xE2, 0x82, 0xAC])?;
+        test_case('😎', vec![0xF0, 0x9F, 0x98, 0x8E])?;
         Ok(())
     }
 

+ 3 - 1
crates/node/src/serde_blocktree/error.rs

@@ -13,6 +13,7 @@ pub enum Error {
     TooManyVariants(u32),
     TypeConversion,
     NotSupported(&'static str),
+    InvalidUtf8Char
 }
 
 impl std::error::Error for Error {}
@@ -32,7 +33,8 @@ impl Display for Error {
             ),
             Error::TypeConversion => formatter.write_str("type conversion failed"),
             Error::NotSupported(message) => formatter.write_fmt(format_args!(
-                "Operation is not supported: {}", message))
+                "Operation is not supported: {}", message)),
+            Error::InvalidUtf8Char => formatter.write_str("Invalid UTF-8 character encountered.")
         }
     }
 }