@@ -401,21 +401,36 @@ getNonNullableColumn totalRows description chunks =
401401 PageDecoder a ->
402402 m Column
403403 go decoder =
404- foldNonNullable totalRows $
405- (\ (vs, _, _) -> vs)
406- <$> Stream. unfoldMany (readPages description decoder) (Stream. fromList chunks)
404+ foldNonNullable totalRows (foldColumnPagesM description decoder chunks)
405+
406+ -- Decode a non-nullable BYTE_ARRAY (UTF-8) column straight into one shared
407+ -- byte buffer + offsets ('PackedText') rather than a boxed vector of per-row
408+ -- 'Text'. Each page's decoded 'Text' values (which share the chunk dictionary
409+ -- for dictionary-encoded pages) are appended by memcpy into a single builder
410+ -- across all pages/chunks, then frozen once; the fold itself carries only '()'.
411+ -- This is the same representation the fast CSV reader uses and matches Arrow's
412+ -- string layout: no retained per-row 'Text' headers, no eager UTF-8 validation.
413+ goPackedText :: m Column
414+ goPackedText = do
415+ builder <- liftIO $ stToIO (newTextBuilder totalRows (totalRows * 8 )) -- NOTE(review): 'totalRows * 8' is presumably an initial byte-capacity guess; confirm the builder grows beyond it
416+ _ <-
417+ foldColumnDataPagesM
418+ description
419+ chunks
420+ ( \ () (dict, enc, nPresent, valBytes, _, _) ->
421+ liftIO (appendStringPageIO builder dict enc nPresent valBytes)
422+ )
423+ ()
424+ chunk <- liftIO $ stToIO (freezeTextChunk builder)
425+ pure (mergeTextChunks [chunk])
407426
408427 unboxedGo ::
409428 forall a .
410429 (Columnable a , VU. Unbox a ) =>
411430 UnboxedPageDecoder a ->
412431 m Column
413432 unboxedGo decoder =
414- foldNonNullableUnboxed totalRows $
415- (\ (vs, _, _) -> vs)
416- <$> Stream. unfoldMany
417- (readPages description decoder)
418- (Stream. fromList chunks)
433+ foldNonNullableUnboxed totalRows (foldColumnPagesM description decoder chunks)
419434
420435-- | Decode an optional (nullable) column.
421436{-# INLINEABLE getNullableColumn #-}
@@ -449,20 +464,36 @@ getNullableColumn totalRows description chunks =
449464 PageDecoder a ->
450465 m Column
451466 go decoder =
452- foldNullable maxDef totalRows $
453- (\ (vs, ds, _) -> (vs, ds))
454- <$> Stream. unfoldMany (readPages description decoder) (Stream. fromList chunks)
467+ foldNullable maxDef totalRows (foldColumnPagesM description decoder chunks)
468+
469+ -- Nullable BYTE_ARRAY (UTF-8) variant: decode straight into a 'PackedText'
470+ -- (shared byte buffer + offsets + validity bitmap) via the text builder,
471+ -- walking each page's definition levels (with 'maxDef') to interleave nulls.
472+ -- Avoids the boxed @Vector Text@ the generic 'foldNullable' path would build.
473+ goPackedTextNullable :: m Column
474+ goPackedTextNullable = do
475+ builder <- liftIO $ stToIO (newTextBuilder totalRows (totalRows * 8 )) -- NOTE(review): 'totalRows * 8' is presumably an initial byte-capacity guess; confirm the builder grows beyond it
476+ _ <-
477+ foldColumnDataPagesM
478+ description
479+ chunks
480+ ( \ () (dict, enc, nPresent, valBytes, defs, _) ->
481+ liftIO
482+ (appendNullableStringPageIO builder maxDef dict enc nPresent valBytes defs)
483+ )
484+ ()
485+ chunk <- liftIO $ stToIO (freezeTextChunk builder)
486+ pure (mergeTextChunks [chunk])
455487 unboxedGo ::
456488 forall a .
457489 (Columnable a , VU. Unbox a ) =>
458490 UnboxedPageDecoder a ->
459491 m Column
460492 unboxedGo decoder =
461- foldNullableUnboxed maxDef totalRows $
462- (\ (vs, ds, _) -> (vs, ds))
463- <$> Stream. unfoldMany
464- (readPages description decoder)
465- (Stream. fromList chunks)
493+ foldNullableUnboxed
494+ maxDef
495+ totalRows
496+ (foldColumnPagesM description decoder chunks)
466497
467498-- | Decode a repeated (list/nested) column.
468499{-# INLINEABLE getRepeatedColumn #-}
@@ -501,8 +532,7 @@ getRepeatedColumn description chunks =
501532 PageDecoder a ->
502533 m Column
503534 go decoder =
504- foldRepeated maxRep maxDef $
505- Stream. unfoldMany (readPages description decoder) (Stream. fromList chunks)
535+ foldRepeated maxRep maxDef (foldColumnPagesM description decoder chunks)
506536
507537 unboxedGo ::
508538 forall a .
@@ -515,10 +545,7 @@ getRepeatedColumn description chunks =
515545 UnboxedPageDecoder a ->
516546 m Column
517547 unboxedGo decoder =
518- foldRepeatedUnboxed maxRep maxDef $
519- Stream. unfoldMany
520- (readPages description decoder)
521- (Stream. fromList chunks)
548+ foldRepeatedUnboxed maxRep maxDef (foldColumnPagesM description decoder chunks)
522549
523550-- Options application -----------------------------------------------------
524551
0 commit comments