From 7cb1ffe100befaa6236f14c14081961c44d6ad21 Mon Sep 17 00:00:00 2001
From: Matthias Benkort <5680256+KtorZ@users.noreply.github.com>
Date: Sat, 13 Jan 2024 14:09:16 +0100
Subject: [PATCH] fix(codec): Fix flat encoding and decoding of arbitrarily
 size integers (#378)

This commits fixes the flat encoding and decoding (and consequently,
  the zigzag) for large integers in the following ways:

  - It removes support for encoding and decoding i128 values.

  - It optionally (feature = "num-bigint") introduces encoding and
    decoding of large sized integers through the num-bigint::BigInt
    type.

  Without the feature enabled, it is still possible to encode and decode
  isize values; but the use of i128 is now prohibited (as it would
  overflow on boundaries) in favor of arbitrarily sized integers.

  The commit also introduces a missing property roundtrip for encoding
  and decoding large integers, which was missing and thus, failed to
  identify the overflow problem.

  See related issue: https://github.com/aiken-lang/aiken/issues/796
---
 pallas-codec/Cargo.toml                 |  4 ++
 pallas-codec/src/flat/decode/decoder.rs | 26 ++++++-----
 pallas-codec/src/flat/decode/mod.rs     |  8 +++-
 pallas-codec/src/flat/encode/encoder.rs | 40 ++++++++++-------
 pallas-codec/src/flat/encode/mod.rs     |  8 +++-
 pallas-codec/src/flat/zigzag.rs         | 60 +++++++++++++++++--------
 pallas-codec/tests/flat.rs              | 31 +++++++++++++
 pallas-codec/tests/zigzag.rs            | 10 ++---
 8 files changed, 133 insertions(+), 54 deletions(-)

diff --git a/pallas-codec/Cargo.toml b/pallas-codec/Cargo.toml
index 0b70455..419d22b 100644
--- a/pallas-codec/Cargo.toml
+++ b/pallas-codec/Cargo.toml
@@ -14,9 +14,13 @@ authors = [
   "Kasey White <kwhitemsg@gmail.com>",
 ]
 
+[features]
+default = []
+
 [dependencies]
 hex = "0.4.3"
 minicbor = { version = "0.20", features = ["std", "half", "derive"] }
+num-bigint = { version = "0.4.4", optional = true }
 serde = { version = "1.0.143", features = ["derive"] }
 thiserror = "1.0.39"
 
diff --git a/pallas-codec/src/flat/decode/decoder.rs b/pallas-codec/src/flat/decode/decoder.rs
index 63dc8fd..24ef006 100644
--- a/pallas-codec/src/flat/decode/decoder.rs
+++ b/pallas-codec/src/flat/decode/decoder.rs
@@ -1,7 +1,9 @@
 use super::Decode;
-use crate::flat::zigzag;
-
 use super::Error;
+use crate::flat::zigzag::ZigZag;
+
+#[cfg(feature = "num-bigint")]
+use num_bigint::{BigInt, BigUint};
 
 #[derive(Debug)]
 pub struct Decoder<'b> {
@@ -24,7 +26,8 @@ impl<'b> Decoder<'b> {
         T::decode(self)
     }
 
-    /// Decode an integer of any size.
+    /// Decode an isize integer.
+    ///
     /// This is byte alignment agnostic.
     /// First we decode the next 8 bits of the buffer.
     /// We take the 7 least significant bits as the 7 least significant bits of
@@ -35,10 +38,11 @@ impl<'b> Decoder<'b> {
     /// any more bits. Finally we use zigzag to convert the unsigned integer
     /// back to a signed integer.
     pub fn integer(&mut self) -> Result<isize, Error> {
-        Ok(zigzag::to_isize(self.word()?))
+        Ok(self.word()?.zigzag())
     }
 
-    /// Decode an integer of 128 bits size.
+    /// Decode an integer of an arbitrary size..
+    ///
     /// This is byte alignment agnostic.
     /// First we decode the next 8 bits of the buffer.
     /// We take the 7 least significant bits as the 7 least significant bits of
@@ -48,8 +52,9 @@ impl<'b> Decoder<'b> {
     /// so on. If the most significant bit was instead 0 we stop decoding
     /// any more bits. Finally we use zigzag to convert the unsigned integer
     /// back to a signed integer.
-    pub fn big_integer(&mut self) -> Result<i128, Error> {
-        Ok(zigzag::to_i128(self.big_word()?))
+    #[cfg(feature = "num-bigint")]
+    pub fn big_integer(&mut self) -> Result<BigInt, Error> {
+        Ok(self.big_word()?.zigzag())
     }
 
     /// Decode a single bit of the buffer to get a bool.
@@ -162,15 +167,16 @@ impl<'b> Decoder<'b> {
     /// filling in the next 7 least significant bits of the unsigned integer and
     /// so on. If the most significant bit was instead 0 we stop decoding
     /// any more bits.
-    pub fn big_word(&mut self) -> Result<u128, Error> {
+    #[cfg(feature = "num-bigint")]
+    pub fn big_word(&mut self) -> Result<BigUint, Error> {
         let mut leading_bit = 1;
-        let mut final_word: u128 = 0;
+        let mut final_word: BigUint = (0 as u8).into();
         let mut shl: u128 = 0;
         // continue looping if lead bit is 1 which is 128 as a u8 otherwise exit
         while leading_bit > 0 {
             let word8 = self.bits8(8)?;
             let word7 = word8 & 127;
-            final_word |= (word7 as u128) << shl;
+            final_word |= <u8 as Into<BigUint>>::into(word7) << shl;
             shl += 7;
             leading_bit = word8 & 128;
         }
diff --git a/pallas-codec/src/flat/decode/mod.rs b/pallas-codec/src/flat/decode/mod.rs
index 431c88a..4e004ca 100644
--- a/pallas-codec/src/flat/decode/mod.rs
+++ b/pallas-codec/src/flat/decode/mod.rs
@@ -3,6 +3,9 @@ mod error;
 
 use crate::flat::filler::Filler;
 
+#[cfg(feature = "num-bigint")]
+use num_bigint::BigInt;
+
 pub use decoder::Decoder;
 pub use error::Error;
 
@@ -36,9 +39,10 @@ impl Decode<'_> for isize {
     }
 }
 
-impl Decode<'_> for i128 {
+#[cfg(feature = "num-bigint")]
+impl Decode<'_> for BigInt {
     fn decode(d: &mut Decoder) -> Result<Self, Error> {
-        d.big_integer()
+        Ok(d.big_integer()?.into())
     }
 }
 
diff --git a/pallas-codec/src/flat/encode/encoder.rs b/pallas-codec/src/flat/encode/encoder.rs
index d17525b..8a95238 100644
--- a/pallas-codec/src/flat/encode/encoder.rs
+++ b/pallas-codec/src/flat/encode/encoder.rs
@@ -1,7 +1,9 @@
 use super::Encode;
-use crate::flat::zigzag;
-
 use super::Error;
+use crate::flat::zigzag::ZigZag;
+
+#[cfg(feature = "num-bigint")]
+use num_bigint::{BigInt, BigUint};
 
 pub struct Encoder {
     pub buffer: Vec<u8>,
@@ -89,7 +91,8 @@ impl Encoder {
         Ok(self)
     }
 
-    /// Encode an integer of any size.
+    /// Encode an isize integer.
+    ///
     /// This is byte alignment agnostic.
     /// First we use zigzag once to double the number and encode the negative
     /// sign as the least significant bit. Next we encode the 7 least
@@ -97,25 +100,21 @@ impl Encoder {
     /// 127 we encode a leading 1 followed by repeating the encoding above for
     /// the next 7 bits and so on.
     pub fn integer(&mut self, i: isize) -> &mut Self {
-        let i = zigzag::to_usize(i);
-
-        self.word(i);
-
+        self.word(i.zigzag());
         self
     }
 
-    /// Encode an integer of 128 bits size.
+    /// Encode an arbitrarily sized integer.
+    ///
     /// This is byte alignment agnostic.
     /// First we use zigzag once to double the number and encode the negative
     /// sign as the least significant bit. Next we encode the 7 least
     /// significant bits of the unsigned integer. If the number is greater than
     /// 127 we encode a leading 1 followed by repeating the encoding above for
     /// the next 7 bits and so on.
-    pub fn big_integer(&mut self, i: i128) -> &mut Self {
-        let i = zigzag::to_u128(i);
-
-        self.big_word(i);
-
+    #[cfg(feature = "num-bigint")]
+    pub fn big_integer(&mut self, i: BigInt) -> &mut Self {
+        self.big_word(i.zigzag());
         self
     }
 
@@ -181,18 +180,25 @@ impl Encoder {
     /// We encode the 7 least significant bits of the unsigned byte. If the char
     /// value is greater than 127 we encode a leading 1 followed by
     /// repeating the above for the next 7 bits and so on.
-    pub fn big_word(&mut self, c: u128) -> &mut Self {
+    #[cfg(feature = "num-bigint")]
+    pub fn big_word(&mut self, c: BigUint) -> &mut Self {
         let mut d = c;
+        let zero = (0 as u8).into();
         loop {
-            let mut w = (d & 127) as u8;
+            let m: usize = 127;
+            let mut w = (d.clone() & <usize as Into<BigUint>>::into(m))
+                .to_bytes_be()
+                .pop()
+                .unwrap();
+
             d >>= 7;
 
-            if d != 0 {
+            if d != zero {
                 w |= 128;
             }
             self.bits(8, w);
 
-            if d == 0 {
+            if d == zero {
                 break;
             }
         }
diff --git a/pallas-codec/src/flat/encode/mod.rs b/pallas-codec/src/flat/encode/mod.rs
index 8b30bc9..7d81a5f 100644
--- a/pallas-codec/src/flat/encode/mod.rs
+++ b/pallas-codec/src/flat/encode/mod.rs
@@ -3,6 +3,9 @@ mod error;
 
 use crate::flat::filler::Filler;
 
+#[cfg(feature = "num-bigint")]
+use num_bigint::BigInt;
+
 pub use encoder::Encoder;
 pub use error::Error;
 
@@ -26,9 +29,10 @@ impl Encode for u8 {
     }
 }
 
-impl Encode for i128 {
+#[cfg(feature = "num-bigint")]
+impl Encode for BigInt {
     fn encode(&self, e: &mut Encoder) -> Result<(), Error> {
-        e.big_integer(*self);
+        e.big_integer(self.clone());
 
         Ok(())
     }
diff --git a/pallas-codec/src/flat/zigzag.rs b/pallas-codec/src/flat/zigzag.rs
index 4b9f4c4..9631e29 100644
--- a/pallas-codec/src/flat/zigzag.rs
+++ b/pallas-codec/src/flat/zigzag.rs
@@ -1,27 +1,51 @@
-pub fn to_usize(x: isize) -> usize {
-    let double_x = x << 1;
+#[cfg(feature = "num-bigint")]
+use num_bigint::{BigInt, BigUint, ToBigInt};
 
-    if x.is_positive() || x == 0 {
-        double_x as usize
-    } else {
-        (-double_x - 1) as usize
+pub trait ZigZag {
+    type Zag;
+    fn zigzag(self) -> Self::Zag;
+}
+
+#[cfg(feature = "num-bigint")]
+impl ZigZag for BigInt {
+    type Zag = BigUint;
+
+    fn zigzag(self) -> Self::Zag where {
+        if self >= 0.into() {
+            self << 1
+        } else {
+            let double: BigInt = self << 1;
+            -double - <u8 as Into<BigInt>>::into(1)
+        }
+        .to_biguint()
+        .expect("number is positive")
     }
 }
 
-pub fn to_isize(u: usize) -> isize {
-    ((u >> 1) as isize) ^ (-((u & 1) as isize))
-}
+impl ZigZag for isize {
+    type Zag = usize;
 
-pub fn to_u128(x: i128) -> u128 {
-    let double_x = x << 1;
-
-    if x.is_positive() || x == 0 {
-        double_x as u128
-    } else {
-        (-double_x - 1) as u128
+    fn zigzag(self) -> Self::Zag where {
+        let bits = isize::BITS as i128;
+        let i = self as i128;
+        ((i << 1) ^ (i >> (bits - 1))) as usize
     }
 }
 
-pub fn to_i128(u: u128) -> i128 {
-    ((u >> 1) as i128) ^ (-((u & 1) as i128))
+#[cfg(feature = "num-bigint")]
+impl ZigZag for BigUint {
+    type Zag = BigInt;
+
+    fn zigzag(self) -> Self::Zag where {
+        let i = self.to_bigint().expect("always possible");
+        (i.clone() >> 1) ^ -(i & <u8 as Into<BigInt>>::into(1))
+    }
+}
+
+impl ZigZag for usize {
+    type Zag = isize;
+
+    fn zigzag(self) -> Self::Zag where {
+        ((self >> 1) as isize) ^ -((self & 1) as isize)
+    }
 }
diff --git a/pallas-codec/tests/flat.rs b/pallas-codec/tests/flat.rs
index 9d503b6..e63724f 100644
--- a/pallas-codec/tests/flat.rs
+++ b/pallas-codec/tests/flat.rs
@@ -8,6 +8,37 @@ prop_compose! {
     }
 }
 
+#[cfg(feature = "num-bigint")]
+mod bigint {
+    use super::arb_big_vec;
+    use num_bigint::{BigInt, Sign};
+    use pallas_codec::flat::{decode, encode};
+    use proptest::prelude::*;
+
+    prop_compose! {
+        fn arb_isize()(i: isize) -> BigInt {
+            i.into()
+        }
+    }
+
+    fn arb_bigint() -> impl Strategy<Value = BigInt> {
+        prop_oneof![
+            arb_isize(),
+            arb_big_vec().prop_map(|xs| BigInt::from_bytes_be(Sign::Plus, &xs)),
+            arb_big_vec().prop_map(|xs| BigInt::from_bytes_be(Sign::Minus, &xs))
+        ]
+    }
+
+    proptest! {
+        #[test]
+        fn encode_bigint(x in arb_bigint()) {
+            let bytes = encode(&x).unwrap();
+            let decoded: BigInt = decode(&bytes).unwrap();
+            assert_eq!(decoded, x);
+        }
+    }
+}
+
 #[test]
 fn encode_bool() {
     let bytes = encode(&true).unwrap();
diff --git a/pallas-codec/tests/zigzag.rs b/pallas-codec/tests/zigzag.rs
index 2901b74..f859a01 100644
--- a/pallas-codec/tests/zigzag.rs
+++ b/pallas-codec/tests/zigzag.rs
@@ -1,18 +1,18 @@
-use pallas_codec::flat::zigzag::{to_isize, to_usize};
+use pallas_codec::flat::zigzag::ZigZag;
 use proptest::prelude::*;
 
 proptest! {
     #[test]
     fn zigzag(i: isize) {
-        let u = to_usize(i);
-        let converted_i = to_isize(u);
+        let u = i.zigzag();
+        let converted_i = u.zigzag();
         assert_eq!(converted_i, i);
     }
 
     #[test]
     fn zagzig(u: usize) {
-        let i = to_isize(u);
-        let converted_u = to_usize(i);
+        let i = u.zigzag();
+        let converted_u = i.zigzag();
         assert_eq!(converted_u, u);
     }
 }