Skip to content

Cannot read nullable date/datetime columns returned as Arrow object #869

@cgiachalis

Description

@cgiachalis

Continuing from #866: when date/datetime data is stored with nullable = TRUE and then read back as an Arrow object, the date/datetime columns come back entirely null, even for non-missing values. See below.

Reprex

Create schema

library(tiledb)

uri <- tempfile()

domain <- tiledb_domain(tiledb_dim("row", c(0L, 100L), 100L, "INT32"))

attrib <- c(tiledb_attr("date",   type = "DATETIME_DAY", nullable = TRUE),
            tiledb_attr("datetime",   type = "DATETIME_MS", nullable = TRUE),
            tiledb_attr("nanosecs",   type = "DATETIME_NS", nullable = TRUE),
            tiledb_attr("float64",  type = "FLOAT64", nullable = TRUE))

schema <- tiledb_array_schema(domain, attrib, sparse=TRUE)
res <- tiledb_array_create(uri, schema)

Store data and read back as data.table

df <- data.frame(row     =  1:2,
                 date    =  c(as.Date("1990-01-01"), as.Date(NA)),
                 datetime   =  c(as.POSIXct("1990-01-01"), as.POSIXct(NA)),
                 nanosecs   =  nanotime::as.nanotime(c(100, NA)),
                 float64 =  c(1, NA))

# Save data and read back as data.table
arr <- tiledb_array(uri, return_as="data.table")
arr[] <- df
arr[]
#>      row       date                  datetime     nanosecs                           float64
#>    <int>     <Date>                    <POSc>     <nanotime>                          <num>
#> 1:     1 1990-01-01       1990-01-01 00:00:00    1970-01-01T00:00:00.000000100+00:00     1
#> 2:     2 1970-01-01 -292275055-05-16 18:21:56    <NA>                                   NA

Read back as arrow

# Now read back as arrow table
arr <- tiledb_array(uri, return_as = "arrow")

# print
arr[]
#> Table
#> 2 rows x 5 columns
#> $row <int32 not null>
#> $date <date32[day]>
#> $datetime <timestamp[ms]>
#> $nanosecs <timestamp[ns]>
#> $float64 <double>

# Convert to data.table
data.table::as.data.table(arr[])
#>      row   date datetime nanosecs float64
#>    <int> <Date>   <POSc>   <POSc>   <num>
#> 1:     1   <NA>     <NA>     <NA>       1
#> 2:     2   <NA>     <NA>     <NA>      NA

# or equivalent conversion
arr[]$to_data_frame()
#>   row date datetime nanosecs float64
#> 1   1 <NA>     <NA>     <NA>       1
#> 2   2 <NA>     <NA>     <NA>      NA

# hmm...
arr[][["date"]]
#> ChunkedArray
#> <date32[day]>
#> [
#>   [
#>     null,
#>     null
#>   ]
#> ]

Relevant issues: #847, #866

With nullable = FALSE, reading back as Arrow works as expected

library(tiledb)

uri <- tempfile()


domain <- tiledb_domain(tiledb_dim("row", c(0L, 100L), 100L, "INT32"))

attrib <- c(tiledb_attr("date",   type = "DATETIME_DAY", nullable = FALSE),
            tiledb_attr("datetime",   type = "DATETIME_MS", nullable = FALSE),
            tiledb_attr("nanosecs",   type = "DATETIME_NS", nullable = FALSE),
            tiledb_attr("float64",  type = "FLOAT64", nullable = FALSE))

schema <- tiledb_array_schema(domain, attrib, sparse=TRUE)
res <- tiledb_array_create(uri, schema)


df <- data.frame(row     =  1:2,
                 date    =  c(as.Date("1990-01-01"), as.Date(NA)),
                 datetime   =  c(as.POSIXct("1990-01-01"), as.POSIXct(NA)),
                 nanosecs   =  nanotime::as.nanotime(c(100, NA)),
                 float64 =  c(1, NA))

# Save data and read back as data.table
arr <- tiledb_array(uri, return_as="data.table")
arr[] <- df
arr[]
#>      row       date                  datetime
#>    <int>     <Date>                    <POSc>
#> 1:     1 1990-01-01       1990-01-01 00:00:00
#> 2:     2 1970-01-01 -292275055-05-16 18:21:56
#>                               nanosecs float64
#>                             <nanotime>   <num>
#> 1: 1970-01-01T00:00:00.000000100+00:00       1
#> 2:                                <NA>      NA

# Now read back as arrow table
arr <- tiledb_array(uri, return_as = "arrow")

# print
arr[]
#> Table
#> 2 rows x 5 columns
#> $row <int32 not null>
#> $date <date32[day] not null>
#> $datetime <timestamp[ms] not null>
#> $nanosecs <timestamp[ns] not null>
#> $float64 <double not null>

# Convert to data.table
data.table::as.data.table(arr[])
#>      row       date                  datetime            nanosecs float64
#>    <int>     <Date>                    <POSc>              <POSc>   <num>
#> 1:     1 1990-01-01       1990-01-01 00:00:00 1970-01-01 02:00:00       1
#> 2:     2 1970-01-01 -292275055-05-16 18:21:56 1677-09-21 01:47:35      NA

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions