Skip to content

Commit 3104fcc

Browse files
committed
Revert changes to IO/Parquet.hs
1 parent: b21897c · commit: 3104fcc

1 file changed

Lines changed: 49 additions & 22 deletions

File tree

dataframe-parquet/src/DataFrame/IO/Parquet.hs

Lines changed: 49 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -401,21 +401,36 @@ getNonNullableColumn totalRows description chunks =
401401
PageDecoder a ->
402402
m Column
403403
go decoder =
404-
foldNonNullable totalRows $
405-
(\(vs, _, _) -> vs)
406-
<$> Stream.unfoldMany (readPages description decoder) (Stream.fromList chunks)
404+
foldNonNullable totalRows (foldColumnPagesM description decoder chunks)
405+
406+
-- Decode a non-nullable BYTE_ARRAY (UTF-8) column straight into a single
407+
-- shared byte buffer + offsets ('PackedText'), instead of a boxed vector
408+
-- of per-row 'Text'. Each page's decoded 'Text' values (which share the
409+
-- chunk dictionary for dictionary-encoded pages) are appended by memcpy
410+
-- into one builder across all pages/chunks, then frozen once. This is the
411+
-- same representation the fast CSV reader uses and matches Arrow's string
412+
-- layout: no retained per-row 'Text' headers, no eager UTF-8 validation.
413+
goPackedText :: m Column
414+
goPackedText = do
415+
builder <- liftIO $ stToIO (newTextBuilder totalRows (totalRows * 8))
416+
_ <-
417+
foldColumnDataPagesM
418+
description
419+
chunks
420+
( \() (dict, enc, nPresent, valBytes, _, _) ->
421+
liftIO (appendStringPageIO builder dict enc nPresent valBytes)
422+
)
423+
()
424+
chunk <- liftIO $ stToIO (freezeTextChunk builder)
425+
pure (mergeTextChunks [chunk])
407426

408427
unboxedGo ::
409428
forall a.
410429
(Columnable a, VU.Unbox a) =>
411430
UnboxedPageDecoder a ->
412431
m Column
413432
unboxedGo decoder =
414-
foldNonNullableUnboxed totalRows $
415-
(\(vs, _, _) -> vs)
416-
<$> Stream.unfoldMany
417-
(readPages description decoder)
418-
(Stream.fromList chunks)
433+
foldNonNullableUnboxed totalRows (foldColumnPagesM description decoder chunks)
419434

420435
-- | Decode an optional (nullable) column.
421436
{-# INLINEABLE getNullableColumn #-}
@@ -449,20 +464,36 @@ getNullableColumn totalRows description chunks =
449464
PageDecoder a ->
450465
m Column
451466
go decoder =
452-
foldNullable maxDef totalRows $
453-
(\(vs, ds, _) -> (vs, ds))
454-
<$> Stream.unfoldMany (readPages description decoder) (Stream.fromList chunks)
467+
foldNullable maxDef totalRows (foldColumnPagesM description decoder chunks)
468+
469+
-- Nullable BYTE_ARRAY (UTF-8): decode straight into a 'PackedText' (shared
470+
-- byte buffer + offsets + validity bitmap) via the text builder, walking
471+
-- def-levels to interleave nulls. Avoids the boxed @Vector Text@ the
472+
-- generic 'foldNullable' path would build.
473+
goPackedTextNullable :: m Column
474+
goPackedTextNullable = do
475+
builder <- liftIO $ stToIO (newTextBuilder totalRows (totalRows * 8))
476+
_ <-
477+
foldColumnDataPagesM
478+
description
479+
chunks
480+
( \() (dict, enc, nPresent, valBytes, defs, _) ->
481+
liftIO
482+
(appendNullableStringPageIO builder maxDef dict enc nPresent valBytes defs)
483+
)
484+
()
485+
chunk <- liftIO $ stToIO (freezeTextChunk builder)
486+
pure (mergeTextChunks [chunk])
455487
unboxedGo ::
456488
forall a.
457489
(Columnable a, VU.Unbox a) =>
458490
UnboxedPageDecoder a ->
459491
m Column
460492
unboxedGo decoder =
461-
foldNullableUnboxed maxDef totalRows $
462-
(\(vs, ds, _) -> (vs, ds))
463-
<$> Stream.unfoldMany
464-
(readPages description decoder)
465-
(Stream.fromList chunks)
493+
foldNullableUnboxed
494+
maxDef
495+
totalRows
496+
(foldColumnPagesM description decoder chunks)
466497

467498
-- | Decode a repeated (list/nested) column.
468499
{-# INLINEABLE getRepeatedColumn #-}
@@ -501,8 +532,7 @@ getRepeatedColumn description chunks =
501532
PageDecoder a ->
502533
m Column
503534
go decoder =
504-
foldRepeated maxRep maxDef $
505-
Stream.unfoldMany (readPages description decoder) (Stream.fromList chunks)
535+
foldRepeated maxRep maxDef (foldColumnPagesM description decoder chunks)
506536

507537
unboxedGo ::
508538
forall a.
@@ -515,10 +545,7 @@ getRepeatedColumn description chunks =
515545
UnboxedPageDecoder a ->
516546
m Column
517547
unboxedGo decoder =
518-
foldRepeatedUnboxed maxRep maxDef $
519-
Stream.unfoldMany
520-
(readPages description decoder)
521-
(Stream.fromList chunks)
548+
foldRepeatedUnboxed maxRep maxDef (foldColumnPagesM description decoder chunks)
522549

523550
-- Options application -----------------------------------------------------
524551

0 commit comments

Comments
 (0)