From d771bd512b2cc484a0b9286d66ecae1cd9a812c7 Mon Sep 17 00:00:00 2001 From: James Santucci Date: Sat, 6 Jun 2026 16:45:33 -0700 Subject: [PATCH 1/7] nix: Make project config consistent * Make examples depend on granite version consistent with non-examples; non-examples want >= 0.6, examples wanted 0.6.* * Remove `flake.lock` from `.gitignore` -- this is an important file to have in version control; without it, reproducibility guarantees are out the window (since, if we're using `nixpkgs-*` as an input, without the `flake.lock`, the `nixpkgs-*` package set you use won't necessarily be the same `nixpkgs-*` package set that I use) * Add (almost) the rest of the dataframe-* packages to the package outputs / dev shell --- .gitignore | 3 +- dataframe-parquet/src/DataFrame/IO/Parquet.hs | 71 ++++++------------- examples/examples.cabal | 2 +- flake.lock | 61 ++++++++++++++++ flake.nix | 62 +++++++++++++++- 5 files changed, 144 insertions(+), 55 deletions(-) create mode 100644 flake.lock diff --git a/.gitignore b/.gitignore index 607f2e4a..ada5d0b5 100644 --- a/.gitignore +++ b/.gitignore @@ -28,7 +28,6 @@ dataframe_benchmark/ bin/ coverage-html .DS_Store -flake.lock tags __pycache__ venv @@ -45,4 +44,4 @@ Cargo.lock # (transient; the committed *.db fixtures themselves stay tracked). *.db-wal *.db-shm -*.db-journal \ No newline at end of file +*.db-journal diff --git a/dataframe-parquet/src/DataFrame/IO/Parquet.hs b/dataframe-parquet/src/DataFrame/IO/Parquet.hs index 66e8ce0e..bcd7f653 100644 --- a/dataframe-parquet/src/DataFrame/IO/Parquet.hs +++ b/dataframe-parquet/src/DataFrame/IO/Parquet.hs @@ -401,28 +401,9 @@ getNonNullableColumn totalRows description chunks = PageDecoder a -> m Column go decoder = - foldNonNullable totalRows (foldColumnPagesM description decoder chunks) - - -- Decode a non-nullable BYTE_ARRAY (UTF-8) column straight into a single - -- shared byte buffer + offsets ('PackedText'), instead of a boxed vector - -- of per-row 'Text'. Each page's decoded 'Text' values (which share the - -- chunk dictionary for dictionary-encoded pages) are appended by memcpy - -- into one builder across all pages/chunks, then frozen once. This is the - -- same representation the fast CSV reader uses and matches Arrow's string - -- layout: no retained per-row 'Text' headers, no eager UTF-8 validation. - goPackedText :: m Column - goPackedText = do - builder <- liftIO $ stToIO (newTextBuilder totalRows (totalRows * 8)) - _ <- - foldColumnDataPagesM - description - chunks - ( \() (dict, enc, nPresent, valBytes, _, _) -> - liftIO (appendStringPageIO builder dict enc nPresent valBytes) - ) - () - chunk <- liftIO $ stToIO (freezeTextChunk builder) - pure (mergeTextChunks [chunk]) + foldNonNullable totalRows $ + (\(vs, _, _) -> vs) + <$> Stream.unfoldMany (readPages description decoder) (Stream.fromList chunks) unboxedGo :: forall a. @@ -430,7 +411,11 @@ getNonNullableColumn totalRows description chunks = UnboxedPageDecoder a -> m Column unboxedGo decoder = - foldNonNullableUnboxed totalRows (foldColumnPagesM description decoder chunks) + foldNonNullableUnboxed totalRows $ + (\(vs, _, _) -> vs) + <$> Stream.unfoldMany + (readPages description decoder) + (Stream.fromList chunks) -- | Decode an optional (nullable) column. {-# INLINEABLE getNullableColumn #-} @@ -464,36 +449,20 @@ getNullableColumn totalRows description chunks = PageDecoder a -> m Column go decoder = - foldNullable maxDef totalRows (foldColumnPagesM description decoder chunks) - - -- Nullable BYTE_ARRAY (UTF-8): decode straight into a 'PackedText' (shared - -- byte buffer + offsets + validity bitmap) via the text builder, walking - -- def-levels to interleave nulls. Avoids the boxed @Vector Text@ the - -- generic 'foldNullable' path would build. - goPackedTextNullable :: m Column - goPackedTextNullable = do - builder <- liftIO $ stToIO (newTextBuilder totalRows (totalRows * 8)) - _ <- - foldColumnDataPagesM - description - chunks - ( \() (dict, enc, nPresent, valBytes, defs, _) -> - liftIO - (appendNullableStringPageIO builder maxDef dict enc nPresent valBytes defs) - ) - () - chunk <- liftIO $ stToIO (freezeTextChunk builder) - pure (mergeTextChunks [chunk]) + foldNullable maxDef totalRows $ + (\(vs, ds, _) -> (vs, ds)) + <$> Stream.unfoldMany (readPages description decoder) (Stream.fromList chunks) unboxedGo :: forall a. (Columnable a, VU.Unbox a) => UnboxedPageDecoder a -> m Column unboxedGo decoder = - foldNullableUnboxed - maxDef - totalRows - (foldColumnPagesM description decoder chunks) + foldNullableUnboxed maxDef totalRows $ + (\(vs, ds, _) -> (vs, ds)) + <$> Stream.unfoldMany + (readPages description decoder) + (Stream.fromList chunks) -- | Decode a repeated (list/nested) column. {-# INLINEABLE getRepeatedColumn #-} @@ -532,7 +501,8 @@ getRepeatedColumn description chunks = PageDecoder a -> m Column go decoder = - foldRepeated maxRep maxDef (foldColumnPagesM description decoder chunks) + foldRepeated maxRep maxDef $ + Stream.unfoldMany (readPages description decoder) (Stream.fromList chunks) unboxedGo :: forall a. @@ -545,7 +515,10 @@ getRepeatedColumn description chunks = UnboxedPageDecoder a -> m Column unboxedGo decoder = - foldRepeatedUnboxed maxRep maxDef (foldColumnPagesM description decoder chunks) + foldRepeatedUnboxed maxRep maxDef $ + Stream.unfoldMany + (readPages description decoder) + (Stream.fromList chunks) -- Options application ----------------------------------------------------- diff --git a/examples/examples.cabal b/examples/examples.cabal index 0cb1d9f7..e74248a2 100644 --- a/examples/examples.cabal +++ b/examples/examples.cabal @@ -139,7 +139,7 @@ executable examples cassava >= 0.1 && < 1, containers >= 0.6.7 && < 0.9, directory >= 1.3.0.0 && < 2, - granite ^>= 0.6, + granite >= 0.6 && < 1, hashable >= 1.2 && < 2, hasktorch, http-conduit, diff --git a/flake.lock b/flake.lock new file mode 100644 index 00000000..96f37ecf --- /dev/null +++ b/flake.lock @@ -0,0 +1,61 @@ +{ + "nodes": { + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1731533236, + "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1780243769, + "narHash": "sha256-x5UQuRsH3MqI0U9afaXSNqzTPSeZlRLvFAav2Ux1pNw=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "331800de5053fcebacf6813adb5db9c9dca22a0c", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix index b5e3f8e4..b1f06b80 100644 --- a/flake.nix +++ b/flake.nix @@ -14,14 +14,42 @@ repo = "granite"; owner = "mchav"; rev = "main"; - hash = "sha256-Z/o8gxMOBltKiaL0NEjMUyOvUljRvKErWeM6Ul3GM9k="; + hash = "sha256-jmmI2+kbqe+X/CDP986qQnUMGR35iNW5deNLovHpBHA="; + }; + pinchPkg = pkgs.fetchFromGitHub { + repo = "pinch"; + owner = "abhinav"; + rev = "v0.5.2.0"; + hash = "sha256-kuCS4EePc4aIONCvF0sOZt4pCazAq1z9+a/AY9b7Q6c="; + }; + networkRunPkg = pkgs.fetchFromGitHub { + repo = "network-run"; + owner = "kazu-yamamoto"; + rev = "v0.3.1"; + hash = "sha256-xyyf+Le2x9ACJBE4ua7wWHsfOQHNi7D+DksghZFh35I="; }; hsPkgs = pkgs.haskellPackages.extend (self: super: { granite = self.callCabal2nix "granite" granitePkg { }; + network-run = self.callCabal2nix "network-run" networkRunPkg { }; + pinch = self.callCabal2nix "pinch" pinchPkg { }; + dataframe-arrow = self.callCabal2nix "dataframe-arrow" ./dataframe-arrow { }; + dataframe-core = self.callCabal2nix "dataframe-core" ./dataframe-core { }; + dataframe-csv = self.callCabal2nix "dataframe-csv" ./dataframe-csv { }; + dataframe-csv-th = self.callCabal2nix "dataframe-csv-th" ./dataframe-csv-th { }; dataframe-fastcsv = self.callCabal2nix "dataframe-fastcsv" ./dataframe-fastcsv { }; - dataframe-persistent = self.callCabal2nix "dataframe-persistent" ./dataframe-persistent { }; + # dataframe-fusion = self.callCabal2nix "dataframe-fusion" ./dataframe-fusion { }; dataframe-hasktorch = self.callCabal2nix "dataframe-hasktorch" ./dataframe-hasktorch { }; + dataframe-json = self.callCabal2nix "dataframe-json" ./dataframe-json { }; + dataframe-lazy = self.callCabal2nix "dataframe-lazy" ./dataframe-lazy { }; + dataframe-learn = self.callCabal2nix "dataframe-learn" ./dataframe-learn { }; + dataframe-operations = self.callCabal2nix "dataframe-operations" ./dataframe-operations { }; + dataframe-parquet = self.callCabal2nix "dataframe-parquet" ./dataframe-parquet { }; + dataframe-parquet-th = self.callCabal2nix "dataframe-parquet-th" ./dataframe-parquet-th { }; + dataframe-parsing = self.callCabal2nix "dataframe-parsing" ./dataframe-parsing { }; + dataframe-persistent = self.callCabal2nix "dataframe-persistent" ./dataframe-persistent { }; + dataframe-th = self.callCabal2nix "dataframe-th" ./dataframe-th { }; + dataframe-viz = self.callCabal2nix "dataframe-viz" ./dataframe-viz { }; dataframe = self.callCabal2nix "dataframe" ./. { }; }); in @@ -29,17 +57,45 @@ packages = { default = hsPkgs.dataframe; dataframe = hsPkgs.dataframe; + dataframe-arrow = hsPkgs.dataframe-arrow; + dataframe-core = hsPkgs.dataframe-core; + dataframe-csv = hsPkgs.dataframe-csv; + dataframe-csv-th = hsPkgs.dataframe-csv-th; dataframe-fastcsv = hsPkgs.dataframe-fastcsv; + # dataframe-fusion = hsPkgs.dataframe-fusion; dataframe-hasktorch = hsPkgs.dataframe-hasktorch; + dataframe-json = hsPkgs.dataframe-json; + dataframe-lazy = hsPkgs.dataframe-lazy; + dataframe-learn = hsPkgs.dataframe-learn; + dataframe-operations = hsPkgs.dataframe-operations; + dataframe-parquet = hsPkgs.dataframe-parquet; + dataframe-parquet-th = hsPkgs.dataframe-parquet-th; + dataframe-parsing = hsPkgs.dataframe-parsing; dataframe-persistent = hsPkgs.dataframe-persistent; + dataframe-th = hsPkgs.dataframe-th; + dataframe-viz = hsPkgs.dataframe-viz; }; devShells.default = hsPkgs.shellFor { packages = ps: [ ps.dataframe + ps.dataframe-arrow + ps.dataframe-core + ps.dataframe-csv + ps.dataframe-csv-th ps.dataframe-fastcsv - ps.dataframe-persistent + # ps.dataframe-fusion ps.dataframe-hasktorch + ps.dataframe-json + ps.dataframe-lazy + ps.dataframe-learn + ps.dataframe-operations + ps.dataframe-parquet + ps.dataframe-parquet-th + ps.dataframe-parsing + ps.dataframe-persistent + ps.dataframe-th + ps.dataframe-viz ]; nativeBuildInputs = with hsPkgs; [ ghc From 28735cd213db06219a515ca001a6d1aa5b98a77e Mon Sep 17 00:00:00 2001 From: James Santucci Date: Tue, 30 Jun 2026 17:36:37 -0700 Subject: [PATCH 2/7] update granite rev --- flake.nix | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/flake.nix b/flake.nix index b1f06b80..e5b0f113 100644 --- a/flake.nix +++ b/flake.nix @@ -13,8 +13,9 @@ granitePkg = pkgs.fetchFromGitHub { repo = "granite"; owner = "mchav"; - rev = "main"; - hash = "sha256-jmmI2+kbqe+X/CDP986qQnUMGR35iNW5deNLovHpBHA="; + # main as of 2026/06/30 + rev = "b3e83fc42ef3a3e032f58072ae1962281a7b2b00"; + hash = "sha256-xT85Kdsk1tFD3+7Tuv69hpTwB/NPwJ1KFus1MfPIGBE="; }; pinchPkg = pkgs.fetchFromGitHub { repo = "pinch"; From 15c5e0525f0d30958ce5e3b919af50f793485f3e Mon Sep 17 00:00:00 2001 From: James Santucci Date: Tue, 30 Jun 2026 17:36:44 -0700 Subject: [PATCH 3/7] add other packages --- flake.nix | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/flake.nix b/flake.nix index e5b0f113..47d54dc3 100644 --- a/flake.nix +++ b/flake.nix @@ -38,9 +38,11 @@ dataframe-core = self.callCabal2nix "dataframe-core" ./dataframe-core { }; dataframe-csv = self.callCabal2nix "dataframe-csv" ./dataframe-csv { }; dataframe-csv-th = self.callCabal2nix "dataframe-csv-th" ./dataframe-csv-th { }; + dataframe-expr-serializer = self.callCabal2nix "dataframe-expr-serializer" ./dataframe-expr-serializer { }; dataframe-fastcsv = self.callCabal2nix "dataframe-fastcsv" ./dataframe-fastcsv { }; # dataframe-fusion = self.callCabal2nix "dataframe-fusion" ./dataframe-fusion { }; dataframe-hasktorch = self.callCabal2nix "dataframe-hasktorch" ./dataframe-hasktorch { }; + dataframe-huggingface = self.callCabal2nix "dataframe-huggingface" ./dataframe-huggingface { }; dataframe-json = self.callCabal2nix "dataframe-json" ./dataframe-json { }; dataframe-lazy = self.callCabal2nix "dataframe-lazy" ./dataframe-lazy { }; dataframe-learn = self.callCabal2nix "dataframe-learn" ./dataframe-learn { }; @@ -84,9 +86,11 @@ ps.dataframe-core ps.dataframe-csv ps.dataframe-csv-th + ps.dataframe-expr-serializer ps.dataframe-fastcsv # ps.dataframe-fusion ps.dataframe-hasktorch + ps.dataframe-huggingface ps.dataframe-json ps.dataframe-lazy ps.dataframe-learn From b21897c013e40df9dc0b6abcb39c9a1a43102741 Mon Sep 17 00:00:00 2001 From: James Santucci Date: Tue, 30 Jun 2026 17:54:08 -0700 Subject: [PATCH 4/7] parallel -> 3.3.0.0 for df-learn + fmt --- flake.nix | 69 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/flake.nix b/flake.nix index 47d54dc3..2c8470f3 100644 --- a/flake.nix +++ b/flake.nix @@ -6,8 +6,14 @@ flake-utils.url = "github:numtide/flake-utils"; }; - outputs = { self, nixpkgs, flake-utils }: - flake-utils.lib.eachDefaultSystem (system: + outputs = + { + self, + nixpkgs, + flake-utils, + }: + flake-utils.lib.eachDefaultSystem ( + system: let pkgs = nixpkgs.legacyPackages.${system}; granitePkg = pkgs.fetchFromGitHub { @@ -30,31 +36,37 @@ hash = "sha256-xyyf+Le2x9ACJBE4ua7wWHsfOQHNi7D+DksghZFh35I="; }; - hsPkgs = pkgs.haskellPackages.extend (self: super: { - granite = self.callCabal2nix "granite" granitePkg { }; - network-run = self.callCabal2nix "network-run" networkRunPkg { }; - pinch = self.callCabal2nix "pinch" pinchPkg { }; - dataframe-arrow = self.callCabal2nix "dataframe-arrow" ./dataframe-arrow { }; - dataframe-core = self.callCabal2nix "dataframe-core" ./dataframe-core { }; - dataframe-csv = self.callCabal2nix "dataframe-csv" ./dataframe-csv { }; - dataframe-csv-th = self.callCabal2nix "dataframe-csv-th" ./dataframe-csv-th { }; - dataframe-expr-serializer = self.callCabal2nix "dataframe-expr-serializer" ./dataframe-expr-serializer { }; - dataframe-fastcsv = self.callCabal2nix "dataframe-fastcsv" ./dataframe-fastcsv { }; - # dataframe-fusion = self.callCabal2nix "dataframe-fusion" ./dataframe-fusion { }; - dataframe-hasktorch = self.callCabal2nix "dataframe-hasktorch" ./dataframe-hasktorch { }; - dataframe-huggingface = self.callCabal2nix "dataframe-huggingface" ./dataframe-huggingface { }; - dataframe-json = self.callCabal2nix "dataframe-json" ./dataframe-json { }; - dataframe-lazy = self.callCabal2nix "dataframe-lazy" ./dataframe-lazy { }; - dataframe-learn = self.callCabal2nix "dataframe-learn" ./dataframe-learn { }; - dataframe-operations = self.callCabal2nix "dataframe-operations" ./dataframe-operations { }; - dataframe-parquet = self.callCabal2nix "dataframe-parquet" ./dataframe-parquet { }; - dataframe-parquet-th = self.callCabal2nix "dataframe-parquet-th" ./dataframe-parquet-th { }; - dataframe-parsing = self.callCabal2nix "dataframe-parsing" ./dataframe-parsing { }; - dataframe-persistent = self.callCabal2nix "dataframe-persistent" ./dataframe-persistent { }; - dataframe-th = self.callCabal2nix "dataframe-th" ./dataframe-th { }; - dataframe-viz = self.callCabal2nix "dataframe-viz" ./dataframe-viz { }; - dataframe = self.callCabal2nix "dataframe" ./. { }; - }); + hsPkgs = pkgs.haskellPackages.extend ( + self: super: { + granite = self.callCabal2nix "granite" granitePkg { }; + network-run = self.callCabal2nix "network-run" networkRunPkg { }; + pinch = self.callCabal2nix "pinch" pinchPkg { }; + dataframe-arrow = self.callCabal2nix "dataframe-arrow" ./dataframe-arrow { }; + dataframe-core = self.callCabal2nix "dataframe-core" ./dataframe-core { }; + dataframe-csv = self.callCabal2nix "dataframe-csv" ./dataframe-csv { }; + dataframe-csv-th = self.callCabal2nix "dataframe-csv-th" ./dataframe-csv-th { }; + dataframe-expr-serializer = + self.callCabal2nix "dataframe-expr-serializer" ./dataframe-expr-serializer + { }; + dataframe-fastcsv = self.callCabal2nix "dataframe-fastcsv" ./dataframe-fastcsv { }; + # dataframe-fusion = self.callCabal2nix "dataframe-fusion" ./dataframe-fusion { }; + dataframe-hasktorch = self.callCabal2nix "dataframe-hasktorch" ./dataframe-hasktorch { }; + dataframe-huggingface = self.callCabal2nix "dataframe-huggingface" ./dataframe-huggingface { }; + dataframe-json = self.callCabal2nix "dataframe-json" ./dataframe-json { }; + dataframe-lazy = self.callCabal2nix "dataframe-lazy" ./dataframe-lazy { }; + dataframe-learn = self.callCabal2nix "dataframe-learn" ./dataframe-learn { + parallel = pkgs.haskell.lib.dontCheck (self.callHackage "parallel" "3.3.0.0" { }); + }; + dataframe-operations = self.callCabal2nix "dataframe-operations" ./dataframe-operations { }; + dataframe-parquet = self.callCabal2nix "dataframe-parquet" ./dataframe-parquet { }; + dataframe-parquet-th = self.callCabal2nix "dataframe-parquet-th" ./dataframe-parquet-th { }; + dataframe-parsing = self.callCabal2nix "dataframe-parsing" ./dataframe-parsing { }; + dataframe-persistent = self.callCabal2nix "dataframe-persistent" ./dataframe-persistent { }; + dataframe-th = self.callCabal2nix "dataframe-th" ./dataframe-th { }; + dataframe-viz = self.callCabal2nix "dataframe-viz" ./dataframe-viz { }; + dataframe = self.callCabal2nix "dataframe" ./. { }; + } + ); in { packages = { @@ -109,5 +121,6 @@ ]; withHoogle = true; }; - }); + } + ); } From 3104fccaef734b28058e512fbfc4184fadbee178 Mon Sep 17 00:00:00 2001 From: James Santucci Date: Tue, 30 Jun 2026 17:57:00 -0700 Subject: [PATCH 5/7] Revert changes to IO/Parquet.hs --- dataframe-parquet/src/DataFrame/IO/Parquet.hs | 71 +++++++++++++------ 1 file changed, 49 insertions(+), 22 deletions(-) diff --git a/dataframe-parquet/src/DataFrame/IO/Parquet.hs b/dataframe-parquet/src/DataFrame/IO/Parquet.hs index bcd7f653..66e8ce0e 100644 --- a/dataframe-parquet/src/DataFrame/IO/Parquet.hs +++ b/dataframe-parquet/src/DataFrame/IO/Parquet.hs @@ -401,9 +401,28 @@ getNonNullableColumn totalRows description chunks = PageDecoder a -> m Column go decoder = - foldNonNullable totalRows $ - (\(vs, _, _) -> vs) - <$> Stream.unfoldMany (readPages description decoder) (Stream.fromList chunks) + foldNonNullable totalRows (foldColumnPagesM description decoder chunks) + + -- Decode a non-nullable BYTE_ARRAY (UTF-8) column straight into a single + -- shared byte buffer + offsets ('PackedText'), instead of a boxed vector + -- of per-row 'Text'. Each page's decoded 'Text' values (which share the + -- chunk dictionary for dictionary-encoded pages) are appended by memcpy + -- into one builder across all pages/chunks, then frozen once. This is the + -- same representation the fast CSV reader uses and matches Arrow's string + -- layout: no retained per-row 'Text' headers, no eager UTF-8 validation. + goPackedText :: m Column + goPackedText = do + builder <- liftIO $ stToIO (newTextBuilder totalRows (totalRows * 8)) + _ <- + foldColumnDataPagesM + description + chunks + ( \() (dict, enc, nPresent, valBytes, _, _) -> + liftIO (appendStringPageIO builder dict enc nPresent valBytes) + ) + () + chunk <- liftIO $ stToIO (freezeTextChunk builder) + pure (mergeTextChunks [chunk]) unboxedGo :: forall a. @@ -411,11 +430,7 @@ getNonNullableColumn totalRows description chunks = UnboxedPageDecoder a -> m Column unboxedGo decoder = - foldNonNullableUnboxed totalRows $ - (\(vs, _, _) -> vs) - <$> Stream.unfoldMany - (readPages description decoder) - (Stream.fromList chunks) + foldNonNullableUnboxed totalRows (foldColumnPagesM description decoder chunks) -- | Decode an optional (nullable) column. {-# INLINEABLE getNullableColumn #-} @@ -449,20 +464,36 @@ getNullableColumn totalRows description chunks = PageDecoder a -> m Column go decoder = - foldNullable maxDef totalRows $ - (\(vs, ds, _) -> (vs, ds)) - <$> Stream.unfoldMany (readPages description decoder) (Stream.fromList chunks) + foldNullable maxDef totalRows (foldColumnPagesM description decoder chunks) + + -- Nullable BYTE_ARRAY (UTF-8): decode straight into a 'PackedText' (shared + -- byte buffer + offsets + validity bitmap) via the text builder, walking + -- def-levels to interleave nulls. Avoids the boxed @Vector Text@ the + -- generic 'foldNullable' path would build. + goPackedTextNullable :: m Column + goPackedTextNullable = do + builder <- liftIO $ stToIO (newTextBuilder totalRows (totalRows * 8)) + _ <- + foldColumnDataPagesM + description + chunks + ( \() (dict, enc, nPresent, valBytes, defs, _) -> + liftIO + (appendNullableStringPageIO builder maxDef dict enc nPresent valBytes defs) + ) + () + chunk <- liftIO $ stToIO (freezeTextChunk builder) + pure (mergeTextChunks [chunk]) unboxedGo :: forall a. (Columnable a, VU.Unbox a) => UnboxedPageDecoder a -> m Column unboxedGo decoder = - foldNullableUnboxed maxDef totalRows $ - (\(vs, ds, _) -> (vs, ds)) - <$> Stream.unfoldMany - (readPages description decoder) - (Stream.fromList chunks) + foldNullableUnboxed + maxDef + totalRows + (foldColumnPagesM description decoder chunks) -- | Decode a repeated (list/nested) column. {-# INLINEABLE getRepeatedColumn #-} @@ -501,8 +532,7 @@ getRepeatedColumn description chunks = PageDecoder a -> m Column go decoder = - foldRepeated maxRep maxDef $ - Stream.unfoldMany (readPages description decoder) (Stream.fromList chunks) + foldRepeated maxRep maxDef (foldColumnPagesM description decoder chunks) unboxedGo :: forall a. @@ -515,10 +545,7 @@ getRepeatedColumn description chunks = UnboxedPageDecoder a -> m Column unboxedGo decoder = - foldRepeatedUnboxed maxRep maxDef $ - Stream.unfoldMany - (readPages description decoder) - (Stream.fromList chunks) + foldRepeatedUnboxed maxRep maxDef (foldColumnPagesM description decoder chunks) -- Options application ----------------------------------------------------- From 6b51abd6827d34854323804aca93012543ee242b Mon Sep 17 00:00:00 2001 From: James Santucci Date: Tue, 30 Jun 2026 18:11:49 -0700 Subject: [PATCH 6/7] Move package overrides inside what depends on them --- flake.nix | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/flake.nix b/flake.nix index 2c8470f3..2f2d5fb6 100644 --- a/flake.nix +++ b/flake.nix @@ -38,9 +38,7 @@ hsPkgs = pkgs.haskellPackages.extend ( self: super: { - granite = self.callCabal2nix "granite" granitePkg { }; network-run = self.callCabal2nix "network-run" networkRunPkg { }; - pinch = self.callCabal2nix "pinch" pinchPkg { }; dataframe-arrow = self.callCabal2nix "dataframe-arrow" ./dataframe-arrow { }; dataframe-core = self.callCabal2nix "dataframe-core" ./dataframe-core { }; dataframe-csv = self.callCabal2nix "dataframe-csv" ./dataframe-csv { }; @@ -58,12 +56,16 @@ parallel = pkgs.haskell.lib.dontCheck (self.callHackage "parallel" "3.3.0.0" { }); }; dataframe-operations = self.callCabal2nix "dataframe-operations" ./dataframe-operations { }; - dataframe-parquet = self.callCabal2nix "dataframe-parquet" ./dataframe-parquet { }; + dataframe-parquet = self.callCabal2nix "dataframe-parquet" ./dataframe-parquet { + pinch = self.callCabal2nix "pinch" pinchPkg { }; + }; dataframe-parquet-th = self.callCabal2nix "dataframe-parquet-th" ./dataframe-parquet-th { }; dataframe-parsing = self.callCabal2nix "dataframe-parsing" ./dataframe-parsing { }; dataframe-persistent = self.callCabal2nix "dataframe-persistent" ./dataframe-persistent { }; dataframe-th = self.callCabal2nix "dataframe-th" ./dataframe-th { }; - dataframe-viz = self.callCabal2nix "dataframe-viz" ./dataframe-viz { }; + dataframe-viz = self.callCabal2nix "dataframe-viz" ./dataframe-viz { + granite = self.callCabal2nix "granite" granitePkg { }; + }; dataframe = self.callCabal2nix "dataframe" ./. { }; } ); From 849ea8fd459e546c33bde61e107578def14c633b Mon Sep 17 00:00:00 2001 From: James Santucci Date: Wed, 1 Jul 2026 17:24:52 -0700 Subject: [PATCH 7/7] Move additional haskell deps into inputs --- flake.lock | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++- flake.nix | 40 +++++++++++++++++----------------------- 2 files changed, 69 insertions(+), 24 deletions(-) diff --git a/flake.lock b/flake.lock index 96f37ecf..2b42c650 100644 --- a/flake.lock +++ b/flake.lock @@ -18,6 +18,38 @@ "type": "github" } }, + "granite": { + "flake": false, + "locked": { + "lastModified": 1782949848, + "narHash": "sha256-oPoDjgrep4DgOTH+UatooiUdVDMBjMRv+ai3fIvulTE=", + "owner": "mchav", + "repo": "granite", + "rev": "3d62c7ce2f02f73b1c0614e3721b7af27147f110", + "type": "github" + }, + "original": { + "owner": "mchav", + "repo": "granite", + "type": "github" + } + }, + "network-run": { + "flake": false, + "locked": { + "lastModified": 1763529229, + "narHash": "sha256-j3Pkvn/eXiciQQIIc+SkWiFSLqbVyAc0SfLyIRZUsv8=", + "owner": "kazu-yamamoto", + "repo": "network-run", + "rev": "f49d0eeafcecce3e26e66edb3209a04cba30defa", + "type": "github" + }, + "original": { + "owner": "kazu-yamamoto", + "repo": "network-run", + "type": "github" + } + }, "nixpkgs": { "locked": { "lastModified": 1780243769, @@ -34,10 +66,29 @@ "type": "github" } }, + "pinch": { + "flake": false, + "locked": { + "lastModified": 1763952216, + "narHash": "sha256-rpk7mqi4C77UFIRTDVWykE30jd5OY5K4l9LWiGo46f8=", + "owner": "abhinav", + "repo": "pinch", + "rev": "0bf7dddf7c3203d3c04aedb709a1c774f99ff796", + "type": "github" + }, + "original": { + "owner": "abhinav", + "repo": "pinch", + "type": "github" + } + }, "root": { "inputs": { "flake-utils": "flake-utils", - "nixpkgs": "nixpkgs" + "granite": "granite", + "network-run": "network-run", + "nixpkgs": "nixpkgs", + "pinch": "pinch" } }, "systems": { diff --git a/flake.nix b/flake.nix index 2f2d5fb6..c3d683a5 100644 --- a/flake.nix +++ b/flake.nix @@ -4,41 +4,35 @@ inputs = { nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; flake-utils.url = "github:numtide/flake-utils"; + pinch = { + url = "github:abhinav/pinch"; + flake = false; + }; + network-run = { + url = "github:kazu-yamamoto/network-run"; + flake = false; + }; + granite = { + url = "github:mchav/granite"; + flake = false; + }; }; outputs = - { + inputs@{ self, nixpkgs, flake-utils, + ... }: flake-utils.lib.eachDefaultSystem ( system: let pkgs = nixpkgs.legacyPackages.${system}; - granitePkg = pkgs.fetchFromGitHub { - repo = "granite"; - owner = "mchav"; - # main as of 2026/06/30 - rev = "b3e83fc42ef3a3e032f58072ae1962281a7b2b00"; - hash = "sha256-xT85Kdsk1tFD3+7Tuv69hpTwB/NPwJ1KFus1MfPIGBE="; - }; - pinchPkg = pkgs.fetchFromGitHub { - repo = "pinch"; - owner = "abhinav"; - rev = "v0.5.2.0"; - hash = "sha256-kuCS4EePc4aIONCvF0sOZt4pCazAq1z9+a/AY9b7Q6c="; - }; - networkRunPkg = pkgs.fetchFromGitHub { - repo = "network-run"; - owner = "kazu-yamamoto"; - rev = "v0.3.1"; - hash = "sha256-xyyf+Le2x9ACJBE4ua7wWHsfOQHNi7D+DksghZFh35I="; - }; hsPkgs = pkgs.haskellPackages.extend ( self: super: { - network-run = self.callCabal2nix "network-run" networkRunPkg { }; + network-run = self.callCabal2nix "network-run" inputs.network-run { }; dataframe-arrow = self.callCabal2nix "dataframe-arrow" ./dataframe-arrow { }; dataframe-core = self.callCabal2nix "dataframe-core" ./dataframe-core { }; dataframe-csv = self.callCabal2nix "dataframe-csv" ./dataframe-csv { }; @@ -57,14 +51,14 @@ }; dataframe-operations = self.callCabal2nix "dataframe-operations" ./dataframe-operations { }; dataframe-parquet = self.callCabal2nix "dataframe-parquet" ./dataframe-parquet { - pinch = self.callCabal2nix "pinch" pinchPkg { }; + pinch = self.callCabal2nix "pinch" inputs.pinch { }; }; dataframe-parquet-th = self.callCabal2nix "dataframe-parquet-th" ./dataframe-parquet-th { }; dataframe-parsing = self.callCabal2nix "dataframe-parsing" ./dataframe-parsing { }; dataframe-persistent = self.callCabal2nix "dataframe-persistent" ./dataframe-persistent { }; dataframe-th = self.callCabal2nix "dataframe-th" ./dataframe-th { }; dataframe-viz = self.callCabal2nix "dataframe-viz" ./dataframe-viz { - granite = self.callCabal2nix "granite" granitePkg { }; + granite = self.callCabal2nix "granite" inputs.granite { }; }; dataframe = self.callCabal2nix "dataframe" ./. { }; }