|
@@ -34,6 +34,61 @@ fn try_from<TSource, TDest: TryFrom<TSource>>(value: TSource) -> Result<TDest> {
|
|
Ok(cast)
|
|
Ok(cast)
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+/// Returns the number of bytes needed to store the unicode character in UTF-9 whose
|
|
|
|
+/// encoding begins with the given byte. If the given byte is not valid as the first byte
|
|
|
|
+/// for any character, then a failed result with Error::InvalidUtf7Char is returned.
|
|
|
|
+fn num_bytes_for_char(first_byte: u8) -> Result<usize> {
|
|
|
|
+ if first_byte & 128u8 == 0 {
|
|
|
|
+ return Ok(1);
|
|
|
|
+ }
|
|
|
|
+ for bit in 2..5 {
|
|
|
|
+ let mask = 128u8 >> bit;
|
|
|
|
+ if mask & first_byte == 0 {
|
|
|
|
+ return Ok(bit);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ Err(Error::InvalidUtf8Char)
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+/// Returns the unicode code point of the character that is encoded in UTF-8 in the given buffer
|
|
|
|
+/// as a u32. If the given buffer is not of a valid length then a failed result is returned.
|
|
|
|
+/// No other verification is performed.
|
|
|
|
+fn u32_from_utf8(buf: &[u8]) -> Result<u32> {
|
|
|
|
+ const MASK: u8 = 0b0011_1111;
|
|
|
|
+
|
|
|
|
+ fn mask(byte: u8, mask: u8) -> u32 {
|
|
|
|
+ (byte & mask) as u32
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ fn two_bytes(buf: &[u8]) -> u32 {
|
|
|
|
+ mask(buf[0], 0b0001_1111) << 6 | mask(buf[1], MASK)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ fn three_bytes(buf: &[u8]) -> u32 {
|
|
|
|
+ mask(buf[0], 0b0000_1111) << 12 | mask(buf[1], MASK) << 6 | mask(buf[2], MASK)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ fn four_bytes(buf: &[u8]) -> u32 {
|
|
|
|
+ mask(buf[0], 0b0000_0111) << 18 | mask(buf[1], MASK) << 12 | mask(buf[2], MASK) << 6
|
|
|
|
+ | mask(buf[3], MASK)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ let code_point = match buf.len() {
|
|
|
|
+ 1 => Some(buf[0] as u32),
|
|
|
|
+ 2 => Some(two_bytes(buf)),
|
|
|
|
+ 3 => Some(three_bytes(buf)),
|
|
|
|
+ 4 => Some(four_bytes(buf)),
|
|
|
|
+ _ => None
|
|
|
|
+ };
|
|
|
|
+ code_point.ok_or(Error::InvalidUtf8Char)
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+fn char_from_utf8(buf: &[u8]) -> Result<char> {
|
|
|
|
+ let result = u32_from_utf8(buf);
|
|
|
|
+ let option = char::from_u32(result?);
|
|
|
|
+ option.ok_or(Error::InvalidUtf8Char)
|
|
|
|
+}
|
|
|
|
+
|
|
impl<'de, T: Read + ?Sized> Deserializer<'de, T> {
|
|
impl<'de, T: Read + ?Sized> Deserializer<'de, T> {
|
|
pub fn new(input: &'de mut T) -> Self {
|
|
pub fn new(input: &'de mut T) -> Self {
|
|
Deserializer { input: input }
|
|
Deserializer { input: input }
|
|
@@ -170,10 +225,13 @@ impl<'de, 'a, T: Read> de::Deserializer<'de> for &'a mut Deserializer<'de, T> {
|
|
}
|
|
}
|
|
|
|
|
|
fn deserialize_char<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
|
|
fn deserialize_char<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
|
|
- // TODO: UTF-8 is a multi-byte encoding. I need to read the correct number of bytes here.
|
|
|
|
let byte = self.read_u8()?;
|
|
let byte = self.read_u8()?;
|
|
- let value = try_from(byte)?;
|
|
|
|
- visitor.visit_char(value)
|
|
|
|
|
|
+ let buf_len = num_bytes_for_char(byte);
|
|
|
|
+ let mut buf = vec![0; buf_len?];
|
|
|
|
+ buf[0] = byte;
|
|
|
|
+ self.read_exact(&mut buf[1..])?;
|
|
|
|
+ let result = char_from_utf8(&buf);
|
|
|
|
+ visitor.visit_char(result?)
|
|
}
|
|
}
|
|
|
|
|
|
fn deserialize_str<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
|
|
fn deserialize_str<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
|
|
@@ -340,7 +398,12 @@ mod test {
|
|
#[allow(unused_imports)]
|
|
#[allow(unused_imports)]
|
|
use serde::Deserialize;
|
|
use serde::Deserialize;
|
|
#[allow(unused_imports)]
|
|
#[allow(unused_imports)]
|
|
- use super::{from_vec, Result, Deserializer};
|
|
|
|
|
|
+ use super::{
|
|
|
|
+ from_vec,
|
|
|
|
+ num_bytes_for_char,
|
|
|
|
+ Result,
|
|
|
|
+ Deserializer
|
|
|
|
+ };
|
|
|
|
|
|
#[test]
|
|
#[test]
|
|
fn new() -> Result<()> {
|
|
fn new() -> Result<()> {
|
|
@@ -351,6 +414,24 @@ mod test {
|
|
Ok(())
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/// Checks that `num_bytes_for_char` reports the encoded length of characters that take
/// one, two, three and four bytes in UTF-8.
#[test]
fn test_num_bytes_for_char() -> Result<()> {
    /// Encodes `c` and compares its encoded length with the length derived from the
    /// first encoded byte.
    fn check(c: char) -> Result<()> {
        let mut storage = [0u8; 4];
        let encoded = c.encode_utf8(&mut storage);
        let expected = encoded.len();
        let first_byte = encoded.as_bytes()[0];
        assert_eq!(expected, num_bytes_for_char(first_byte)?);
        Ok(())
    }

    for &c in ['$', '£', '€', '😑'].iter() {
        check(c)?;
    }
    Ok(())
}
|
|
|
|
+
|
|
#[test]
|
|
#[test]
|
|
fn deserialize_struct() -> Result<()> {
|
|
fn deserialize_struct() -> Result<()> {
|
|
#[derive(Debug, PartialEq, Deserialize)]
|
|
#[derive(Debug, PartialEq, Deserialize)]
|
|
@@ -472,9 +553,16 @@ mod test {
|
|
|
|
|
|
/// Deserializes characters whose UTF-8 encodings are one, two, three and four bytes long
/// and checks that each byte sequence yields the expected `char`.
#[test]
fn deserialize_char() -> Result<()> {
    let cases: Vec<(char, Vec<u8>)> = vec![
        ('*', vec![0x2A]),
        ('£', vec![0xC2, 0xA3]),
        ('€', vec![0xE2, 0x82, 0xAC]),
        ('😎', vec![0xF0, 0x9F, 0x98, 0x8E]),
    ];

    for (expected, bytes) in cases {
        let actual: char = from_vec(&bytes)?;
        assert_eq!(expected, actual);
    }
    Ok(())
}
|
|
|
|
|