diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml index 422232f546..ddba563335 100644 --- a/.github/workflows/pr_build_linux.yml +++ b/.github/workflows/pr_build_linux.yml @@ -287,7 +287,10 @@ jobs: org.apache.comet.exec.CometNativeReaderSuite org.apache.comet.CometIcebergNativeSuite org.apache.comet.CometIcebergRewriteActionSuite + org.apache.comet.CometIcebergWriteActionSuite + org.apache.comet.CometIcebergWriteDetectionSuite org.apache.comet.iceberg.IcebergReflectionSuite + org.apache.comet.serde.operator.IcebergWriteProtoTranslationSuite org.apache.comet.csv.CometCsvNativeReadSuite org.apache.comet.CometFuzzTestSuite org.apache.comet.CometFuzzIcebergSuite diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml index d0a03eeb75..76593e620f 100644 --- a/.github/workflows/pr_build_macos.yml +++ b/.github/workflows/pr_build_macos.yml @@ -129,7 +129,10 @@ jobs: org.apache.comet.exec.CometNativeReaderSuite org.apache.comet.CometIcebergNativeSuite org.apache.comet.CometIcebergRewriteActionSuite + org.apache.comet.CometIcebergWriteActionSuite + org.apache.comet.CometIcebergWriteDetectionSuite org.apache.comet.iceberg.IcebergReflectionSuite + org.apache.comet.serde.operator.IcebergWriteProtoTranslationSuite org.apache.comet.csv.CometCsvNativeReadSuite org.apache.comet.CometFuzzTestSuite org.apache.comet.CometFuzzIcebergSuite diff --git a/docs/source/user-guide/latest/iceberg-writes.md b/docs/source/user-guide/latest/iceberg-writes.md new file mode 100644 index 0000000000..d3ad6ee4d9 --- /dev/null +++ b/docs/source/user-guide/latest/iceberg-writes.md @@ -0,0 +1,331 @@ + + +# Accelerating Apache Iceberg V2 Writes using Comet + +## Overview + +Comet accelerates Iceberg V2 copy-on-write table writes in two complementary layers: + +1. **Split-operator plan** (on by toggle). Comet rewrites Iceberg's single + `V2ExistingTableWriteExec` command into a pair of operators — a **committer** and a + **writer** — so that AQE (and Comet's columnar rules) can see and re-optimise the + data sub-query. The actual file-writing work still runs through Iceberg's stock JVM writer. +2. **Native parquet write** (further toggle). Once the split-operator plan is in place, the + writer can be swapped for a native variant that delegates the per-task parquet write to + [iceberg-rust](https://github.com/apache/iceberg-rust). The committer / table-metadata + update / cache-refresh stay on the JVM exactly as before. + +Both layers fall back to plain Iceberg-Java when they can't safely apply — and both layers are +configured per-session, so you can opt into the split plan but skip the native write while it +burns in. + +## What changes about the Iceberg plan + +### Iceberg's stock V2 plan + +``` +V2ExistingTableWriteExec(write, query) ← single V2 command; commit + write together +└── ← data sub-query (scans, shuffles, ...) +``` + +This is a single command. Spark's `InsertAdaptiveSparkPlan` runs it as a leaf from AQE's +perspective — AQE never sees the data sub-query inside, so Comet's columnar rules can't convert +the scans / shuffles inside it either. + +### Comet's split plan + +``` +IcebergCommitExec(batchWrite, refreshCache) ← committer; runs BatchWrite.commit() +└── AdaptiveSparkPlanExec ← AQE bubble (Spark inserts this) + └── IcebergWriteExec(batchWrite) ← writer (UnaryExecNode); per-task write + └── ← data sub-query: now visible to AQE / Comet +``` + +The committer keeps Iceberg's commit semantics intact. The writer is a normal +`UnaryExecNode`, so AQE wraps it whenever the data sub-query has a shuffle, and Comet's +standard `transformUp` rules can convert the scans / projects / sorts / exchanges inside the +AQE bubble to their Comet counterparts (`CometScan`, `CometProject`, `CometColumnarExchange`, +…). + +### What flows between the two execs + +The writer produces **one row per Spark task** with a single `BINARY` column: +Java-serialised bytes of an Iceberg `SparkWrite$TaskCommit(DataFile[])`. The committer +`executeCollect()`s that RDD, deserialises each row's bytes back into a `WriterCommitMessage`, +and hands the resulting array to `BatchWrite.commit(messages)` — the same call Iceberg-Java +would have made internally. + +## Configuration + +Standard Comet + Iceberg setup ([`iceberg.md`](iceberg.md)) plus two toggles for the write-side +behaviour: + +``` +# Standard Comet / Iceberg wiring +spark.plugins=org.apache.spark.CometPlugin +spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions +spark.sql.catalog.=org.apache.iceberg.spark.SparkCatalog +spark.sql.catalog..type=hadoop # or hive / glue / rest / ... +spark.sql.catalog..warehouse=... + +# Comet write toggles (both off by default; can be turned on independently) +spark.comet.write.iceberg.splitOperator.enabled=true # layer 1: split-operator plan +spark.comet.write.iceberg.nativeAcceleration=true # layer 2: native parquet write +``` + +`IcebergSparkSessionExtensions` is **mandatory on Spark 3.4** for `UPDATE` / `MERGE` on V2 tables — +Spark 3.4's stock planner rejects `UPDATE TABLE` on V2 sources, and Iceberg's analyzer extension +(`RewriteUpdateTable` / `RewriteMergeIntoTable`) is what provides the rewrite. On Spark 3.5+ the +extension is optional but recommended (Spark added native row-level operation support in 3.5). +`INSERT INTO` / `INSERT OVERWRITE` / `DELETE FROM` work without the extension on every Spark +version. + +If `splitOperator.enabled=false`, Comet leaves Iceberg's stock plan alone — every write goes +straight through Iceberg-Java. + +## Spark version compatibility + +Copy-on-write coverage is identical across versions (`UPDATE` / `MERGE` on 3.4 requires +`IcebergSparkSessionExtensions`). Merge-on-read (`WriteDelta`) is intentionally left on Spark's +default path: the per-task `DeltaWriter` is row-dispatched and no native acceleration is +planned, so routing it through the split-operator plan would add planning complexity for no +realisable benefit. + +| Capability | 3.4 | 3.5 | 4.0 | +| ------------------------------------------- | ----- | --- | --- | +| `INSERT INTO` (`AppendData`) | ✅ | ✅ | ✅ | +| `INSERT OVERWRITE` (static + dynamic) | ✅ | ✅ | ✅ | +| `DELETE FROM` copy-on-write (`ReplaceData`) | ✅ | ✅ | ✅ | +| `UPDATE` / `MERGE INTO` copy-on-write | ✅[¹] | ✅ | ✅ | + +- [¹] Requires `IcebergSparkSessionExtensions` (see Configuration above). + +Native parquet write coverage is more selective: + +| Write | Spark 3.4 | Spark 3.5 | Spark 4.0 | +| --------------------------------------- | ----------- | ----------- | ----------- | +| `AppendData` (`INSERT INTO`) | native | native | native | +| `OverwriteByExpression` (static) | native | native | native | +| `OverwritePartitionsDynamic` | native | native | native | +| `ReplaceData` (CoW `DELETE` / `UPDATE`) | native | native | native | +| `ReplaceData` (CoW `MERGE`) | fallback[²] | fallback[²] | fallback[²] | + +- [²] MERGE flows through Spark's (or Iceberg 1.5.2's) `MergeRowsExec`, which per-row dispatches + the matched / not-matched / not-matched-by-source clauses. There's no native equivalent in + Comet _yet_ — this is planned (see Known limitations). The scan / join / sort / shuffle parts + still run native; only the per-row merge dispatch and the file write stay JVM. + +## Fallback triggers + +Before accepting a write into the native path, the serde checks for things iceberg-rust either +doesn't support or implements differently from iceberg-java. The first matching trigger short- +circuits the conversion and the plan stays on the JVM two-op path. + +Each row attributes the gap to the layer that's the actual root cause: **iceberg-rust** (the +high-level Iceberg writer / commit / metadata logic) or **parquet-rs** (the underlying Apache +Parquet Rust encoder that iceberg-rust drives). + +| Property / state | Falls back when … | Why | +| ----------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| resolved write format | `SparkWrite.format` (i.e. effective `write-format` option overlaid on `write.format.default`) is not `parquet` | **iceberg-rust**: writer stack is parquet-only today. The check reads the resolved format, so per-write `option("write-format", "orc")` is honoured even when the table default is parquet. | +| `write.object-storage.enabled` | `true` | **iceberg-rust**: no hashed-prefix object-storage location generator | +| `write.location-provider.impl` | set | **iceberg-rust**: can't load a Java `LocationProvider` class | +| `format-version` | `>= 3` | **iceberg-rust**: V3 row lineage / DVs / variant types not implemented | +| `encryption.*` (any key) | set | **iceberg-rust + parquet-rs**: no Parquet modular encryption support in either layer | +| `write.metadata.metrics.default` | mentions `counts` | **iceberg-rust**: doesn't implement Iceberg's `counts`-without-bounds metrics mode. parquet-rs supports stats on or off but has no "value counts only" knob, so iceberg-rust would need to track and emit those counts itself; it doesn't. | +| `write.metadata.metrics.default` | `none` | **iceberg-rust**: always populates `column_sizes` / `value_counts` / `null_value_counts` / bounds from the parquet footer regardless of the requested mode; iceberg-java's `none` emits an empty `DataFile.metrics`, so a native write would produce strictly richer manifests. | +| `write.metadata.metrics.column.` | any column set to `counts` | **iceberg-rust**: same as default `counts`, per-column | +| `write.metadata.metrics.column.` | any column set to `none` | **iceberg-rust**: same as default `none`, per-column | +| `write.parquet.bloom-filter-max-bytes` | explicitly set | **iceberg-rust**: doesn't enforce iceberg-java's bloom-filter byte cap. parquet-rs has no global cap parameter on `WriterProperties`, so iceberg-rust would need to clamp NDV / FPP itself before passing them through. | +| `write.parquet.bloom-filter-enabled.column.` | any column set to `true` | **iceberg-rust**: bloom-filter sizing diverges from iceberg-java (same root cause as above — no cap clamping) | +| `write.parquet.row-group-check-min-record-count` | set to a non-default value (default `100`) | **parquet-rs**: row-group close cadence is byte-driven and doesn't expose parquet-mr's per-row-count knobs, so iceberg-rust can't honour this property | +| `write.parquet.row-group-check-max-record-count` | set to a non-default value (default `10000`) | **parquet-rs**: same as above | +| `write.parquet.stats-enabled.column.` | any column override set **and** the running Iceberg version defines `TableProperties.PARQUET_COLUMN_STATS_ENABLED_PREFIX` (1.10.0+) | **parquet-rs**: no per-column statistics-enabled override matching parquet-mr's. The property was only added in Iceberg 1.10.0; on 1.5.2 / 1.8.1 Iceberg-Java ignores it too, so there's nothing to diverge from and the write is **not** gated on those versions. | +| `parquet.enable.dictionary` | set | **parquet-rs**: explicit dictionary override is not translated to the native writer settings; parquet-rs would fall back to its own default and diverge from parquet-mr. | +| `write.metadata.metrics.max-inferred-column-defaults` | the number of projected field IDs exceeds this limit (default `100`) **and** no explicit `write.metadata.metrics.default` is set | **iceberg-rust**: doesn't apply `MetricsModes.None` to columns past this limit. Iceberg only applies that inferred-column truncation when no default mode is configured (a user-set default applies to all columns regardless of count), so the gate is skipped once a default is present. | + +> `write.parquet.page-version` is intentionally **not** a fallback trigger: no Iceberg version Comet +> targets (1.5.2 / 1.8.1 / 1.10.0) maps a table property to the Parquet writer version — it is +> hardwired to `PARQUET_1_0`. Setting it is a no-op in Iceberg-Java exactly as in the native path. +> | `io-impl` (catalog property) | set | **iceberg-rust**: native FileIO is selected by URI scheme (`OpenDalStorageFactory`), so an explicit Java `FileIO` class can't be honoured. | +> | Resolved data-location URI scheme | not in `{file, memory, s3, s3a, gs, oss}` | **iceberg-rust**: `iceberg_common::storage_factory_for` resolves only the listed schemes; `hdfs`, `abfs`/`abfss`, `wasb`/`wasbs`, and similar would crash at write time. | + +Every trigger has a pinning test in `CometIcebergWriteDetectionSuite`. + +## Known limitations and planned work + +- **CoW MERGE — native acceleration planned.** The split-operator plan already applies (scan, + join, sort, shuffle run native); only the per-row merge dispatch (`MergeRowsExec`) and the + file write itself stay JVM. The path forward is well-understood: add a `CometMergeRowsExec` + that mirrors Spark's instruction-driven dispatch model (`Keep` / `Discard` / `Split` over + matched / not-matched / not-matched-by-source clauses, plus a bitmap-backed cardinality + check). Tracked. + +- **`sort_order_id` is always stamped `0` (unsorted).** This matches Iceberg-Java for every + version Comet targets: the Spark `SparkWrite$WriterFactory` in Iceberg 1.5.2 / 1.8.1 / 1.10.0 + builds `SparkFileWriterFactory` **without** `.dataSortOrder(...)`, so a stock batch append stamps + `sort_order_id = 0` on committed data files even for a table that has a non-default sort order. + The native writer does the same. (Iceberg 1.11+ adds + `SparkWriteConf.outputSortOrderId(writeRequirements)` and wires the resolved id into the writer + factory; Comet reflects that resolver when present, so behaviour stays correct if the pinned + runtime is bumped.) + +- **Nested (list/map) column bounds differ from Iceberg-Java.** iceberg-java's + `ParquetUtil.shouldStoreBounds` suppresses `lower_bounds` / `upper_bounds` for any field reached + through a repeated (list or map) field, so collection-element primitives get no manifest bounds. + iceberg-rust's `MinMaxColAggregator` computes bounds for every leaf primitive — including + list-element / map-key / map-value fields — so a native write to a schema containing list or map + columns produces **extra** `lower_bounds` / `upper_bounds` entries for those nested field IDs that + Iceberg-Java would omit. `column_sizes` / `value_counts` / `null_value_counts` match. The extra + bounds are not used by Iceberg's metrics evaluators (predicates can't address list/map elements), + so the functional risk is low, but the manifest is not byte-identical to the JVM path for nested + schemas. Strict parity would require stripping those nested-field bounds before manifest encoding + (or in iceberg-rust); not yet done. + +## Tests + +- **`CometIcebergWriteActionSuite`** — end-to-end scenarios against a temporary Hadoop catalog + covering the copy-on-write V2 logical-write variants on both the split-operator-only and + full-native paths, plus parity vs Iceberg-Java, the disabled-config fallback, the + commit-once invariant, and the documented native-engagement-vs-fallback contract for MERGE. + Passes on Spark 3.4 / 3.5 / 4.0 with `IcebergSparkSessionExtensions` loaded. +- **`CometIcebergWriteDetectionSuite`** — tests that build a planned-but-not-executed + `IcebergWriteExec` per fall-back trigger and assert + `CometIcebergNativeWrite.getSupportLevel` returns `Unsupported` with the expected reason + fragment, plus a positive baseline (clean parquet V2 table → `Compatible`). +- **`IcebergWriteProtoTranslationSuite`** — unit tests pinning the JVM-side property → proto + translation table-by-table. + +## Abort behaviour + +The writer calls `writer.abort()` per task on failure to release task-level resources. The +committer (`IcebergCommitExec.run()`) calls `BatchWrite.abort(messages)` if `commit()` raises. + +If task writers stage files locally but their commit messages never reach the driver (e.g. +driver crash mid-collect), the staged Parquet files become **unreferenced orphans**. Iceberg's +catalog-level `RemoveOrphanFiles` action reaps these on the next maintenance run. Schedule +`RemoveOrphanFiles` if you want to avoid storage drift on failed writes. This diverges from +Spark's per-task abort behaviour and is a deliberate trade-off in favour of a simpler +driver-side commit loop. + +--- + +# Developer notes + +The sections below document architectural decisions, interactions with Spark internals, and +load-bearing implementation details. They're not required reading for _using_ the writer — only +for working on it. + +## How the split survives AQE re-planning + +AQE calls `reOptimize(inputPlan.logicalLink.get)` every time a query stage materialises and +re-runs the planner on the result. The trick is making sure the writer stays in +place and the commit stays exactly one — not zero, not two. + +Two design choices keep this stable: + +1. **File writer exec uses a stable logical anchor.** `IcebergWriteLogical` is a + Catalyst `UnaryNode` that wraps the data sub-query plus the `BatchWrite` reference. The + planner's `setLogicalLink` pins the writer to this anchor (not to the surrounding + `ReplaceData` / `AppendData` / etc. logical node) so AQE's `reOptimize` re-plans only the + data query. Without the anchor AQE would either re-fire the surrounding Iceberg write + logical node (duplicating the commit) or strip the writer away (no write + happens). +2. **The `BatchWrite` instance is shared between committer and writer.** Iceberg's + `Write.toBatch()` returns a freshly-constructed `BatchWrite` on every call; for CoW DML the + instance holds the scan state and emitted-file tracking, so the writer's writer + factory and the committer **must** operate on the same instance for `OverwriteFiles`' + serialisable-isolation validation to walk the right snapshot range. The strategy calls + `toBatch` once and threads the result through both execs. + +## Other AQE / planning interactions + +- **AQE re-fire produces a fresh `IcebergWriteExec` over the existing `CometIcebergWriteExec`.** + Without protection this would stack two writers. `CometExecRule`'s `IcebergWriteExec` case + detects this (via `unwrapToCometIcebergWrite`) and returns the existing writer instead — + mirroring the `DataWritingCommandExec(WriteFilesExec(CometNativeWriteExec))` unwrap V1 + parquet uses. +- **`CometIcebergWriteExec` is a block boundary in the serialisation loop.** The post-convert + serialization loop in `CometExecRule` resets `firstNativeOp = true` after + `CometIcebergWriteExec` (alongside `CometNativeWriteExec`) so the upstream Comet chain's + outermost node gets `convertBlock()` called on it and populates its `serializedPlanOpt`. + Without this, an upstream `CometProject` reaches execution without a serialised plan and + `doExecuteColumnar` throws. +- **`CometMetricNode.fromCometPlan` stops at non-Comet descendants.** AQE-stage nodes + (`AQEShuffleReadExec`) can have a null `session` field when constructed off the planning + thread; their lazy `metrics` val NPEs on `sparkContext`. They're not ours to update anyway, + so the metric tree walk just stops at the boundary. + +## Native acceleration internals + +The high-level shape: the JVM-side serde marshals everything iceberg-rust needs (table +properties, write schema, partition spec, target file size, writer mode, per-task IDs, …) into +a single proto, and on each task iceberg-rust returns one `BINARY` row containing an Iceberg V2 +data manifest (Avro container bytes). The JVM-side commit decodes those bytes back into +`DataFile`s via `ManifestFiles.read` and feeds them to `BatchWrite.commit(messages)`. So +everything iceberg-java would do post-write (snapshot id assignment, manifest list aggregation, +commit retries, cache refresh) is untouched — only the per-task parquet emission moved. + +Notable details: + +- **Per-task IDs are stamped onto the proto.** `CometIcebergWriteExec.doExecuteColumnar` reads + `TaskContext.getPartitionId()` and `taskAttemptId()` and rebuilds the `IcebergWrite` proto's + `partition_id` / `task_attempt_id` per task before serialising. Without this every task mints + the same filename prefix (`00000-00000-`) and OpenDAL's `CompleteWriter` trips on the + resulting concurrent-write byte-count drift. Mirrors `CometNativeWriteExec`'s parquet pattern. +- **Spark 4.x `ReplaceData` carries extra columns** that the native writer can't pass through + as-is; `CometIcebergNativeWrite.maybeProjectReplaceDataColumns` splices a DataFusion + `Projection` between the FFI scan and `IcebergWrite` to strip them. The method docstring + has the full mechanics. + +## Iceberg 1.5.2 (Spark 3.4) reflection skew + +Iceberg 1.5.2 differs in a handful of small ways from 1.8+. All of them are papered over with +reflection in `IcebergReflection` rather than per-version source shims: + +- **`useFanoutWriter` field renamed.** 1.5.2 calls it `partitionedFanoutEnabled`. The + `getUseFanoutWriterFromSparkWrite` helper tries the new name first and falls back to the old. +- **`GenericManifestFile` constructor.** Iceberg 1.6 added a 3-arg form + `(InputFile, int, long)` that takes V3's `firstRowId`. 1.5.2 only has the 2-arg form + `(InputFile, int)`. `newDataManifestFile` tries the 3-arg form first and falls back. +- **Null `snapshotId` rejected on read.** Iceberg 1.5.2's + `InheritableMetadataFactory.fromManifest` throws "Cannot read from ManifestFile with null + (unassigned) snapshot ID". 1.8+ relaxed that check. The helper reflectively writes a `0L` + placeholder onto `GenericManifestFile.snapshotId` after construction; it's overwritten by + `BatchWrite.commit` later and never reaches storage. +- **Separate logical write node.** Iceberg 1.5.2 ships its own `ReplaceIcebergData` logical + node because Spark 3.4 lacks native row-level operation support. Field shape is identical to + Spark 3.5+'s stock `ReplaceData`, so `IcebergWriteStrategy` matches it by FQCN and the + extracted tuple reuses the existing dispatcher. + +## Why a custom logical anchor (`IcebergWriteLogical`) + +Without `IcebergWriteLogical`, AQE's `reOptimize` would either: + +- **Vanish** the writer, if its logical link points at the data sub-query (which + carries no write semantics). The re-planned tree would have no writer at all. +- **Duplicate** the committer, if the writer's logical link points at the + surrounding logical write (`ReplaceData` / `AppendData` / …). The strategy would re-fire the + entire two-op tree on every AQE iteration, each iteration emitting a fresh + `IcebergCommitExec` — and Iceberg's `OverwriteFiles` validation then sees the prior + iteration's file as a newly-added conflicting file and fails. + +`IcebergWriteLogical` sits between the two and lets the strategy re-emit a fresh +`IcebergWriteExec` on every iteration without touching the committer. diff --git a/docs/source/user-guide/latest/index.rst b/docs/source/user-guide/latest/index.rst index 00f770c27e..5c8995224a 100644 --- a/docs/source/user-guide/latest/index.rst +++ b/docs/source/user-guide/latest/index.rst @@ -79,6 +79,7 @@ to read more. :hidden: Iceberg Guide + Iceberg Writes S3 Credential Providers Kubernetes Guide diff --git a/native/core/src/cloud/s3/credential_bridge.rs b/native/core/src/cloud/s3/credential_bridge.rs index 14e4bc49d9..9fc5b562cb 100644 --- a/native/core/src/cloud/s3/credential_bridge.rs +++ b/native/core/src/cloud/s3/credential_bridge.rs @@ -53,7 +53,6 @@ static WARNED_MISSING_EXPIRY: OnceCell<()> = OnceCell::new(); #[derive(Debug, Clone, Copy)] pub enum AccessMode { Read = 0, - #[allow(dead_code)] Write = 1, } diff --git a/native/core/src/execution/jni_api.rs b/native/core/src/execution/jni_api.rs index ac223a462b..0c2bc5604f 100644 --- a/native/core/src/execution/jni_api.rs +++ b/native/core/src/execution/jni_api.rs @@ -236,6 +236,7 @@ fn op_name(op: &OpStruct) -> &'static str { OpStruct::Window(_) => "Window", OpStruct::NativeScan(_) => "NativeScan", OpStruct::IcebergScan(_) => "IcebergScan", + OpStruct::IcebergWrite(_) => "IcebergWrite", OpStruct::ParquetWriter(_) => "ParquetWriter", OpStruct::Explode(_) => "Explode", OpStruct::CsvScan(_) => "CsvScan", diff --git a/native/core/src/execution/operators/iceberg_common.rs b/native/core/src/execution/operators/iceberg_common.rs new file mode 100644 index 0000000000..bcb5ee6b57 --- /dev/null +++ b/native/core/src/execution/operators/iceberg_common.rs @@ -0,0 +1,142 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Helpers shared between the Iceberg scan and Iceberg write operators. + +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion::common::DataFusionError; +use iceberg::io::{FileIO, FileIOBuilder, StorageFactory}; +use iceberg_storage_opendal::{CustomAwsCredentialLoader, OpenDalStorageFactory}; + +use crate::cloud::s3::credential_bridge::{AccessMode, CometS3CredentialBridge}; + +/// Activation key for the `CometS3CredentialProvider` SPI, read from a catalog's `s3.*` property +/// bag. +const ICEBERG_PROVIDER_CLASS_PROPERTY: &str = "s3.comet.credential.provider.class"; + +/// Key prefixes forwarded to iceberg-rust's `FileIO`. The full unfiltered catalog bag (catalog +/// URI, OAuth tokens, credentials.uri, tenant-id, etc.) is kept upstream so +/// `CometS3CredentialBridge` can read whatever the vendor needs. +const STORAGE_PROPERTY_PREFIXES: &[&str] = &["s3.", "gcs.", "adls.", "client."]; + +/// Pick an OpenDAL storage backend from a URI's scheme. `file` (or no scheme) falls through to +/// the local file system. `memory` is used by the write path to assemble manifest bytes that +/// stay entirely in-process. For S3, the Comet credential bridge is wired in when a provider +/// class is configured; `access_mode` is forwarded to the JVM SPI so the read and write paths can +/// be granted different (e.g. read-only vs read-write) credentials. +pub(crate) fn storage_factory_for( + path: &str, + catalog_properties: &HashMap, + catalog_name: &str, + access_mode: AccessMode, +) -> Result, DataFusionError> { + let scheme = if path.contains("://") { + path.split("://").next().unwrap_or("file") + } else { + "file" + }; + match scheme { + "file" => Ok(Arc::new(OpenDalStorageFactory::Fs)), + "memory" => Ok(Arc::new(OpenDalStorageFactory::Memory)), + "s3" | "s3a" => { + let customized_credential_load = + build_s3_credential_loader(path, catalog_properties, catalog_name, access_mode); + Ok(Arc::new(OpenDalStorageFactory::S3 { + customized_credential_load, + })) + } + "gs" => Ok(Arc::new(OpenDalStorageFactory::Gcs)), + "oss" => Ok(Arc::new(OpenDalStorageFactory::Oss)), + _ => Err(DataFusionError::Execution(format!( + "Unsupported storage scheme: {scheme}" + ))), + } +} + +/// Build a `FileIO` whose storage scheme is inferred from `reference_path` and whose properties +/// come from the catalog. The reference path is the metadata location for reads or the data +/// location for writes — anything that carries the right URI scheme. `catalog_name` is the +/// credential dispatch key and `access_mode` is the access intent forwarded to the S3 credential +/// bridge, so the write path can request write-capable credentials. +pub(crate) fn load_file_io( + catalog_properties: &HashMap, + reference_path: &str, + catalog_name: &str, + access_mode: AccessMode, +) -> Result { + let factory = storage_factory_for( + reference_path, + catalog_properties, + catalog_name, + access_mode, + )?; + let mut file_io_builder = FileIOBuilder::new(factory); + + // Narrow to storage-prefix keys before forwarding to iceberg-rust's FileIO. The full + // unfiltered bag (catalog URI, OAuth tokens, credentials.uri, tenant-id, etc.) is kept + // upstream so CometS3CredentialBridge can read whatever the vendor needs. + for (key, value) in catalog_properties { + if STORAGE_PROPERTY_PREFIXES.iter().any(|p| key.starts_with(p)) { + file_io_builder = file_io_builder.with_prop(key, value); + } + } + + Ok(file_io_builder.build()) +} + +/// Wires the configured Comet credential provider into opendal's S3 service, or returns `None` +/// so opendal falls back to its default credential chain. +fn build_s3_credential_loader( + reference_path: &str, + catalog_properties: &HashMap, + catalog_name: &str, + access_mode: AccessMode, +) -> Option { + let url = url::Url::parse(reference_path).ok()?; + let bucket = url.host_str()?; + let provider_class = catalog_properties + .get(ICEBERG_PROVIDER_CLASS_PROPERTY) + .map(|s| s.trim()) + .filter(|s| !s.is_empty())?; + // Fall back to the bucket when the table has no catalog identity (e.g. HadoopTables loaded by + // raw path). + let dispatch_key: &str = if catalog_name.is_empty() { + bucket + } else { + catalog_name + }; + let bridge = CometS3CredentialBridge::new( + provider_class, + dispatch_key, + bucket, + url.path(), + access_mode, + catalog_properties, + ); + match bridge { + Ok(b) => Some(CustomAwsCredentialLoader::new(b)), + Err(e) => { + log::warn!( + "Failed to initialize CometS3CredentialBridge for {provider_class}: {e}; \ + falling back to default opendal credential chain" + ); + None + } + } +} diff --git a/native/core/src/execution/operators/iceberg_scan.rs b/native/core/src/execution/operators/iceberg_scan.rs index ce1388f78f..f9568b630a 100644 --- a/native/core/src/execution/operators/iceberg_scan.rs +++ b/native/core/src/execution/operators/iceberg_scan.rs @@ -39,11 +39,9 @@ use datafusion::physical_plan::{ }; use futures::{Stream, StreamExt, TryStreamExt}; use iceberg::arrow::ScanMetrics; -use iceberg::io::{FileIO, FileIOBuilder, StorageFactory}; -use iceberg_storage_opendal::CustomAwsCredentialLoader; -use iceberg_storage_opendal::OpenDalStorageFactory; -use crate::cloud::s3::credential_bridge::{AccessMode, CometS3CredentialBridge}; +use crate::cloud::s3::credential_bridge::AccessMode; +use crate::execution::operators::iceberg_common::load_file_io; use crate::execution::operators::ExecutionError; use crate::parquet::parquet_support::SparkParquetOptions; use crate::parquet::schema_adapter::SparkPhysicalExprAdapterFactory; @@ -51,10 +49,6 @@ use datafusion_comet_spark_expr::EvalMode; use datafusion_physical_expr_adapter::{PhysicalExprAdapter, PhysicalExprAdapterFactory}; use iceberg::scan::FileScanTask; -/// Activation key for the `CometS3CredentialProvider` SPI on the Iceberg path, read from a Spark -/// catalog's `s3.*` property bag. -const ICEBERG_PROVIDER_CLASS_PROPERTY: &str = "s3.comet.credential.provider.class"; - /// Iceberg table scan operator that uses iceberg-rust to read Iceberg tables. /// /// Executes pre-planned FileScanTasks for efficient parallel scanning. @@ -166,10 +160,11 @@ impl IcebergScanExec { context: Arc, ) -> DFResult { let output_schema = Arc::clone(&self.output_schema); - let file_io = Self::load_file_io( + let file_io = load_file_io( &self.catalog_properties, &self.metadata_location, &self.catalog_name, + AccessMode::Read, )?; let batch_size = context.session_config().batch_size(); @@ -214,96 +209,6 @@ impl IcebergScanExec { Ok(Box::pin(wrapped_stream)) } - - fn storage_factory_for( - path: &str, - catalog_properties: &HashMap, - catalog_name: &str, - ) -> Result, DataFusionError> { - let scheme = if path.contains("://") { - path.split("://").next().unwrap_or("file") - } else { - "file" - }; - match scheme { - "file" => Ok(Arc::new(OpenDalStorageFactory::Fs)), - "s3" | "s3a" => { - let customized_credential_load = - build_s3_credential_loader(path, catalog_properties, catalog_name); - Ok(Arc::new(OpenDalStorageFactory::S3 { - customized_credential_load, - })) - } - "gs" => Ok(Arc::new(OpenDalStorageFactory::Gcs)), - "oss" => Ok(Arc::new(OpenDalStorageFactory::Oss)), - _ => Err(DataFusionError::Execution(format!( - "Unsupported storage scheme: {scheme}" - ))), - } - } - - fn load_file_io( - catalog_properties: &HashMap, - metadata_location: &str, - catalog_name: &str, - ) -> Result { - let factory = - Self::storage_factory_for(metadata_location, catalog_properties, catalog_name)?; - let mut file_io_builder = FileIOBuilder::new(factory); - - // Narrow to storage-prefix keys before forwarding to iceberg-rust's FileIO. The full - // unfiltered bag (catalog URI, OAuth tokens, credentials.uri, tenant-id, etc.) is kept - // upstream so CometS3CredentialBridge can read whatever the vendor needs. - for (key, value) in catalog_properties { - if STORAGE_PROPERTY_PREFIXES.iter().any(|p| key.starts_with(p)) { - file_io_builder = file_io_builder.with_prop(key, value); - } - } - - Ok(file_io_builder.build()) - } -} - -const STORAGE_PROPERTY_PREFIXES: &[&str] = &["s3.", "gcs.", "adls.", "client."]; - -/// Wires the configured Comet credential provider into opendal's S3 service, or returns `None` -/// so opendal falls back to its default credential chain. -fn build_s3_credential_loader( - metadata_location: &str, - catalog_properties: &HashMap, - catalog_name: &str, -) -> Option { - let url = url::Url::parse(metadata_location).ok()?; - let bucket = url.host_str()?; - let provider_class = catalog_properties - .get(ICEBERG_PROVIDER_CLASS_PROPERTY) - .map(|s| s.trim()) - .filter(|s| !s.is_empty())?; - // Fall back to the bucket when the table has no catalog identity (e.g. HadoopTables loaded by - // raw path). - let dispatch_key: &str = if catalog_name.is_empty() { - bucket - } else { - catalog_name - }; - let bridge = CometS3CredentialBridge::new( - provider_class, - dispatch_key, - bucket, - url.path(), - AccessMode::Read, - catalog_properties, - ); - match bridge { - Ok(b) => Some(CustomAwsCredentialLoader::new(b)), - Err(e) => { - log::warn!( - "Failed to initialize CometS3CredentialBridge for {provider_class}: {e}; \ - falling back to default opendal credential chain" - ); - None - } - } } /// Metrics for IcebergScanExec diff --git a/native/core/src/execution/operators/iceberg_write.rs b/native/core/src/execution/operators/iceberg_write.rs new file mode 100644 index 0000000000..377a18149d --- /dev/null +++ b/native/core/src/execution/operators/iceberg_write.rs @@ -0,0 +1,1115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Native Iceberg write operator using iceberg-rust. +//! +//! Drains the upstream Arrow stream through iceberg-rust's writer stack +//! (`ParquetWriterBuilder` -> `RollingFileWriterBuilder` -> `DataFileWriterBuilder` +//! -> `Unpartitioned`/`Fanout`/`Clustered`Writer) and emits a single-row, single-column +//! Arrow batch carrying the `Vec` produced for the task, packed as an Iceberg V2 +//! data manifest via iceberg-rust's `ManifestWriter` against an in-memory `FileIO`. The JVM +//! decodes the bytes with `ManifestFiles.read(...)` to recover the `DataFile`s for commit. + +use std::any::Any; +use std::fmt; +use std::sync::Arc; + +use arrow::array::{ArrayRef, BinaryArray, RecordBatch}; +use arrow::datatypes::{DataType, Field, Schema as ArrowSchema, SchemaRef}; +use datafusion::error::{DataFusionError, Result as DFResult}; +use datafusion::execution::TaskContext; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, Partitioning, + PlanProperties, SendableRecordBatchStream, +}; +use futures::TryStreamExt; +use iceberg::arrow::RecordBatchPartitionSplitter; +use iceberg::spec::{ + DataFile, DataFileFormat, ManifestWriterBuilder, PartitionSpec, PartitionSpecRef, + Schema as IcebergSchema, SchemaRef as IcebergSchemaRef, +}; +use iceberg::writer::base_writer::data_file_writer::DataFileWriterBuilder; +use iceberg::writer::file_writer::location_generator::{ + DefaultFileNameGenerator, DefaultLocationGenerator, +}; +use iceberg::writer::file_writer::rolling_writer::RollingFileWriterBuilder; +use iceberg::writer::file_writer::ParquetWriterBuilder; +use iceberg::writer::partitioning::clustered_writer::ClusteredWriter; +use iceberg::writer::partitioning::fanout_writer::FanoutWriter; +use iceberg::writer::partitioning::unpartitioned_writer::UnpartitionedWriter; +#[cfg(test)] +use parquet::arrow::PARQUET_FIELD_ID_META_KEY; +use parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; +use parquet::file::properties::{EnabledStatistics, WriterProperties}; +use parquet::schema::types::ColumnPath; + +use datafusion_comet_proto::spark_operator::{ + CompressionCodec as ProtoCompressionCodec, IcebergParquetColumnStatistics, + IcebergParquetWriteSettings, IcebergWrite, IcebergWriteCommon, + IcebergWriterMode as ProtoIcebergWriterMode, +}; + +use crate::cloud::s3::credential_bridge::AccessMode; +use crate::execution::operators::iceberg_common::load_file_io; + +/// Builder chain instantiated once per task and handed to the partitioning wrapper. +type IcebergDataFileWriterBuilder = + DataFileWriterBuilder; + +/// Native Iceberg write operator. Owns the parsed Iceberg schema/spec and the parquet writer +/// properties; at task execution it builds the iceberg-rust writer stack, drains the upstream +/// Arrow stream into it, and emits a single Avro-encoded `Vec` row. +pub struct IcebergWriteExec { + input: Arc, + common: Arc, + iceberg_schema: IcebergSchemaRef, + partition_spec: PartitionSpecRef, + writer_mode: ProtoIcebergWriterMode, + writer_properties: Arc, + partition_id: Option, + task_attempt_id: Option, + output_schema: SchemaRef, + plan_properties: Arc, + metrics: ExecutionPlanMetricsSet, +} + +impl IcebergWriteExec { + pub fn try_new(input: Arc, proto: IcebergWrite) -> DFResult { + let IcebergWrite { + common, + partition_id, + task_attempt_id, + } = proto; + let common = common.ok_or_else(|| { + DataFusionError::Internal("IcebergWrite missing common payload".into()) + })?; + let settings = common.parquet_settings.as_ref().ok_or_else(|| { + DataFusionError::Internal("IcebergWriteCommon missing parquet_settings".into()) + })?; + let writer_properties = build_writer_properties(settings)?; + let iceberg_schema = parse_iceberg_schema(&common.iceberg_schema_json)?; + let partition_spec = parse_partition_spec(&common.partition_spec_json)?; + let writer_mode = ProtoIcebergWriterMode::try_from(common.writer_mode).map_err(|_| { + DataFusionError::Internal(format!( + "Unknown IcebergWriterMode proto value: {}", + common.writer_mode + )) + })?; + let output_schema = build_output_schema(); + let plan_properties = Self::compute_properties(&input, Arc::clone(&output_schema)); + Ok(Self { + input, + common: Arc::new(common), + iceberg_schema, + partition_spec, + writer_mode, + writer_properties: Arc::new(writer_properties), + partition_id, + task_attempt_id, + output_schema, + plan_properties, + metrics: ExecutionPlanMetricsSet::new(), + }) + } + + fn compute_properties( + input: &Arc, + schema: SchemaRef, + ) -> Arc { + Arc::new(PlanProperties::new( + EquivalenceProperties::new(schema), + Partitioning::UnknownPartitioning(input.output_partitioning().partition_count()), + EmissionType::Final, + Boundedness::Bounded, + )) + } +} + +impl ExecutionPlan for IcebergWriteExec { + fn name(&self) -> &str { + "IcebergWriteExec" + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.output_schema) + } + + fn properties(&self) -> &Arc { + &self.plan_properties + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + mut children: Vec>, + ) -> DFResult> { + if children.len() != 1 { + return Err(DataFusionError::Internal( + "IcebergWriteExec requires exactly one child".into(), + )); + } + Ok(Arc::new(Self { + input: children.pop().unwrap(), + common: Arc::clone(&self.common), + iceberg_schema: Arc::clone(&self.iceberg_schema), + partition_spec: Arc::clone(&self.partition_spec), + writer_mode: self.writer_mode, + writer_properties: Arc::clone(&self.writer_properties), + partition_id: self.partition_id, + task_attempt_id: self.task_attempt_id, + output_schema: Arc::clone(&self.output_schema), + plan_properties: Arc::clone(&self.plan_properties), + metrics: self.metrics.clone(), + })) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> DFResult { + let input_stream = self.input.execute(partition, context)?; + let common = Arc::clone(&self.common); + let iceberg_schema = Arc::clone(&self.iceberg_schema); + let partition_spec = Arc::clone(&self.partition_spec); + let writer_mode = self.writer_mode; + let writer_properties = Arc::clone(&self.writer_properties); + let partition_id = self.partition_id; + let task_attempt_id = self.task_attempt_id; + let output_schema = Arc::clone(&self.output_schema); + + let task = async move { + let data_files = run_write_task( + input_stream, + Arc::clone(&common), + Arc::clone(&iceberg_schema), + Arc::clone(&partition_spec), + writer_mode, + writer_properties.as_ref().clone(), + partition_id, + task_attempt_id, + ) + .await?; + let manifest_bytes = encode_data_files_as_manifest( + data_files, + iceberg_schema, + partition_spec, + partition_id, + task_attempt_id, + &common.operation_id, + ) + .await?; + let batch = build_output_batch(manifest_bytes, &output_schema)?; + Ok::<_, DataFusionError>(futures::stream::iter(vec![Ok(batch)])) + }; + + Ok(Box::pin(RecordBatchStreamAdapter::new( + Arc::clone(&self.output_schema), + futures::stream::once(task).try_flatten(), + ))) + } + + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } +} + +impl fmt::Debug for IcebergWriteExec { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("IcebergWriteExec") + .field("metadata_location", &self.common.metadata_location) + .field("data_location", &self.common.data_location) + .field("operation_id", &self.common.operation_id) + .field("writer_mode", &self.writer_mode) + .field("partition_id", &self.partition_id) + .field("task_attempt_id", &self.task_attempt_id) + .finish() + } +} + +impl DisplayAs for IcebergWriteExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "IcebergWriteExec: metadata_location={}, data_location={}, operation_id={}", + self.common.metadata_location, self.common.data_location, self.common.operation_id + ) + } +} + +/// One-shot per-task write coroutine. Builds the iceberg-rust writer stack, decorates each input +/// batch with `PARQUET_FIELD_ID_META_KEY` metadata so iceberg-rust can match Arrow columns to +/// Iceberg field IDs, and routes through `UnpartitionedWriter`/`FanoutWriter`/`ClusteredWriter` +/// depending on `writer_mode`. +#[allow(clippy::too_many_arguments)] +async fn run_write_task( + mut input: SendableRecordBatchStream, + common: Arc, + iceberg_schema: IcebergSchemaRef, + partition_spec: PartitionSpecRef, + writer_mode: ProtoIcebergWriterMode, + writer_properties: WriterProperties, + partition_id: Option, + task_attempt_id: Option, +) -> DFResult> { + let catalog_properties = common + .catalog_properties + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + let file_io = load_file_io( + &catalog_properties, + &common.data_location, + &common.catalog_name, + AccessMode::Write, + )?; + + let location_generator = + DefaultLocationGenerator::with_data_location(common.data_location.clone()); + let file_name_generator = DefaultFileNameGenerator::new( + file_name_prefix(partition_id, task_attempt_id, &common.operation_id), + None, + DataFileFormat::Parquet, + ); + let parquet_builder = ParquetWriterBuilder::new(writer_properties, Arc::clone(&iceberg_schema)); + let rolling_builder = RollingFileWriterBuilder::new( + parquet_builder, + common.target_file_size_bytes as usize, + file_io, + location_generator, + file_name_generator, + ); + let data_file_builder = DataFileWriterBuilder::new(rolling_builder); + + let unpartitioned = partition_spec.is_unpartitioned(); + let mut writer = match (unpartitioned, writer_mode) { + (true, _) => InnerWriter::Unpartitioned(UnpartitionedWriter::new(data_file_builder)), + (false, ProtoIcebergWriterMode::IcebergWriterFanout) => { + InnerWriter::Fanout(FanoutWriter::new(data_file_builder)) + } + (false, _) => InnerWriter::Clustered(ClusteredWriter::new(data_file_builder)), + }; + + let splitter = if unpartitioned { + None + } else { + Some( + RecordBatchPartitionSplitter::try_new_with_computed_values( + Arc::clone(&iceberg_schema), + Arc::clone(&partition_spec), + ) + .map_err(iceberg_err)?, + ) + }; + + // Build the field-id-decorated target schema once per task; every batch is cast against it. + let target_schema = + Arc::new(iceberg::arrow::schema_to_arrow_schema(&iceberg_schema).map_err(iceberg_err)?); + while let Some(batch) = input.try_next().await? { + let decorated = decorate_batch_with_field_ids(batch, &target_schema)?; + writer.write(decorated, splitter.as_ref()).await?; + } + writer.close().await +} + +/// Enum-based dispatch over the three iceberg-rust partitioning writers. Each variant takes the +/// same builder chain so we can keep the type fixed. +enum InnerWriter { + Unpartitioned(UnpartitionedWriter), + Fanout(FanoutWriter), + Clustered(ClusteredWriter), +} + +impl InnerWriter { + async fn write( + &mut self, + batch: RecordBatch, + splitter: Option<&RecordBatchPartitionSplitter>, + ) -> DFResult<()> { + use iceberg::writer::partitioning::PartitioningWriter; + match self { + InnerWriter::Unpartitioned(w) => w.write(batch).await.map_err(iceberg_err), + InnerWriter::Fanout(w) => { + let parts = splitter + .expect("partition splitter must be Some for partitioned writes") + .split(&batch) + .map_err(iceberg_err)?; + for (key, part) in parts { + w.write(key, part).await.map_err(iceberg_err)?; + } + Ok(()) + } + InnerWriter::Clustered(w) => { + let parts = splitter + .expect("partition splitter must be Some for partitioned writes") + .split(&batch) + .map_err(iceberg_err)?; + for (key, part) in parts { + w.write(key, part).await.map_err(iceberg_err)?; + } + Ok(()) + } + } + } + + async fn close(self) -> DFResult> { + use iceberg::writer::partitioning::PartitioningWriter; + match self { + InnerWriter::Unpartitioned(w) => w.close().await.map_err(iceberg_err), + InnerWriter::Fanout(w) => w.close().await.map_err(iceberg_err), + InnerWriter::Clustered(w) => w.close().await.map_err(iceberg_err), + } + } +} + +// --- helpers ------------------------------------------------------------- + +fn parse_iceberg_schema(json: &str) -> DFResult { + let schema: IcebergSchema = serde_json::from_str(json).map_err(|e| { + DataFusionError::Internal(format!("Failed to parse iceberg schema JSON: {e}")) + })?; + Ok(Arc::new(schema)) +} + +fn parse_partition_spec(json: &str) -> DFResult { + let spec: PartitionSpec = serde_json::from_str(json).map_err(|e| { + DataFusionError::Internal(format!("Failed to parse partition spec JSON: {e}")) + })?; + Ok(Arc::new(spec)) +} + +fn iceberg_err(e: iceberg::Error) -> DataFusionError { + DataFusionError::External(Box::new(e)) +} + +fn build_output_schema() -> SchemaRef { + Arc::new(ArrowSchema::new(vec![Field::new( + "iceberg_manifest", + DataType::Binary, + false, + )])) +} + +/// Align an input batch with the field-id-decorated target schema by casting each column. The +/// caller is responsible for building `target_schema` once per task via +/// `iceberg::arrow::schema_to_arrow_schema` — it carries `PARQUET_FIELD_ID_META_KEY` on every +/// nested field, and `arrow::compute::cast` rebuilds the column structure to match while +/// reusing data buffers. This is the same conformance step the iceberg-rust DataFusion +/// integration gets for free from DataFusion's `INSERT INTO` planner. +fn decorate_batch_with_field_ids( + batch: RecordBatch, + target_schema: &SchemaRef, +) -> DFResult { + if batch.num_columns() != target_schema.fields().len() { + return Err(DataFusionError::Plan(format!( + "Iceberg write column count mismatch: arrow batch has {} columns but schema has {}", + batch.num_columns(), + target_schema.fields().len() + ))); + } + let casted: Vec = batch + .columns() + .iter() + .zip(target_schema.fields().iter()) + .map(|(col, target)| arrow::compute::cast(col, target.data_type())) + .collect::>() + .map_err(DataFusionError::from)?; + RecordBatch::try_new(Arc::clone(target_schema), casted).map_err(DataFusionError::from) +} + +fn file_name_prefix( + partition_id: Option, + task_attempt_id: Option, + operation_id: &str, +) -> String { + format!( + "{:05}-{:05}-{}", + partition_id.unwrap_or(0), + task_attempt_id.unwrap_or(0), + operation_id + ) +} + +/// Serialise the produced data files as an in-memory Iceberg V2 data manifest, then read the +/// manifest bytes back out. The JVM side decodes these bytes with `ManifestFiles.read(...)` to +/// recover the `DataFile`s. +/// +/// The manifest entries carry a placeholder `snapshot_id` (`None` -> `UNASSIGNED_SNAPSHOT_ID = +/// -1`) and a placeholder `sequence_number` of `0`. Neither is meaningful here: the JVM ignores +/// the entry-level fields and only consumes the embedded `DataFile`s, which the driver later +/// re-stamps with the real snapshot id during `BatchWrite.commit`. +async fn encode_data_files_as_manifest( + data_files: Vec, + iceberg_schema: IcebergSchemaRef, + partition_spec: PartitionSpecRef, + partition_id: Option, + task_attempt_id: Option, + operation_id: &str, +) -> DFResult> { + // The manifest is assembled entirely in-process via the `memory` scheme, so the credential + // dispatch key / access mode are inert here. + let memory_io = load_file_io( + &std::collections::HashMap::new(), + "memory:///", + "", + AccessMode::Write, + )?; + let path = format!( + "memory:///comet-manifest-{:05}-{:05}-{}.avro", + partition_id.unwrap_or(0), + task_attempt_id.unwrap_or(0), + operation_id, + ); + let output_file = memory_io.new_output(&path).map_err(iceberg_err)?; + let mut manifest_writer = ManifestWriterBuilder::new( + output_file, + None, + None, + iceberg_schema, + (*partition_spec).clone(), + ) + .build_v2_data(); + for data_file in data_files { + manifest_writer + .add_file(data_file, 0) + .map_err(iceberg_err)?; + } + manifest_writer + .write_manifest_file() + .await + .map_err(iceberg_err)?; + let bytes = memory_io + .new_input(&path) + .map_err(iceberg_err)? + .read() + .await + .map_err(iceberg_err)?; + Ok(bytes.to_vec()) +} + +fn build_output_batch(manifest_bytes: Vec, output_schema: &SchemaRef) -> DFResult { + let array: ArrayRef = Arc::new(BinaryArray::from(vec![manifest_bytes.as_slice()])); + RecordBatch::try_new(Arc::clone(output_schema), vec![array]).map_err(DataFusionError::from) +} + +/// Translate `IcebergParquetWriteSettings` into parquet-rs `WriterProperties`. +/// +/// Iceberg defaults are applied by the JVM-side translator; this function trusts the wire and +/// only re-applies parquet-rs-shaped settings. +fn build_writer_properties(settings: &IcebergParquetWriteSettings) -> DFResult { + let compression = compression_from_proto(settings.compression, settings.compression_level)?; + let mut builder = WriterProperties::builder() + .set_compression(compression) + .set_created_by(settings.created_by.clone()) + .set_max_row_group_bytes(Some(settings.row_group_size_bytes as usize)) + .set_data_page_size_limit(settings.page_size_bytes as usize) + .set_dictionary_page_size_limit(settings.dict_size_bytes as usize) + .set_data_page_row_count_limit(settings.page_row_limit as usize); + + builder = if settings.default_statistics_enabled { + builder + .set_statistics_enabled(EnabledStatistics::Page) + .set_statistics_truncate_length(settings.statistics_truncate_length.map(|n| n as usize)) + } else { + builder.set_statistics_enabled(EnabledStatistics::None) + }; + + for column in &settings.column_statistics { + builder = apply_column_statistics(builder, column); + } + + Ok(builder.build()) +} + +fn compression_from_proto(codec: i32, level: Option) -> DFResult { + let codec = ProtoCompressionCodec::try_from(codec).map_err(|_| { + DataFusionError::Internal(format!("Unknown CompressionCodec proto value: {codec}")) + })?; + match codec { + ProtoCompressionCodec::None => Ok(Compression::UNCOMPRESSED), + ProtoCompressionCodec::Snappy => Ok(Compression::SNAPPY), + ProtoCompressionCodec::Lz4 => Ok(Compression::LZ4), + ProtoCompressionCodec::Zstd => { + let lvl = level.unwrap_or(ZstdLevel::default().compression_level()); + let zstd = ZstdLevel::try_new(lvl).map_err(|e| { + DataFusionError::Internal(format!("Invalid zstd compression level {lvl}: {e}")) + })?; + Ok(Compression::ZSTD(zstd)) + } + ProtoCompressionCodec::Gzip => { + let lvl = match level { + Some(v) => u32::try_from(v).map_err(|_| { + DataFusionError::Internal(format!("Negative gzip compression level: {v}")) + })?, + None => GzipLevel::default().compression_level(), + }; + let gzip = GzipLevel::try_new(lvl).map_err(|e| { + DataFusionError::Internal(format!("Invalid gzip compression level {lvl}: {e}")) + })?; + Ok(Compression::GZIP(gzip)) + } + ProtoCompressionCodec::Brotli => { + let lvl = match level { + Some(v) => u32::try_from(v).map_err(|_| { + DataFusionError::Internal(format!("Negative brotli compression level: {v}")) + })?, + None => BrotliLevel::default().compression_level(), + }; + let brotli = BrotliLevel::try_new(lvl).map_err(|e| { + DataFusionError::Internal(format!("Invalid brotli compression level {lvl}: {e}")) + })?; + Ok(Compression::BROTLI(brotli)) + } + } +} + +fn apply_column_statistics( + builder: parquet::file::properties::WriterPropertiesBuilder, + column: &IcebergParquetColumnStatistics, +) -> parquet::file::properties::WriterPropertiesBuilder { + let path = ColumnPath::new(column.column.split('.').map(str::to_owned).collect()); + if column.enabled { + // Per-column truncate length is honoured at commit time by the JVM via + // `ParquetUtil.footerMetrics`; parquet-rs has no `set_column_statistics_truncate_length`. + builder.set_column_statistics_enabled(path, EnabledStatistics::Page) + } else { + builder.set_column_statistics_enabled(path, EnabledStatistics::None) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use datafusion_comet_proto::spark_operator::CompressionCodec as ProtoCodec; + + fn base_settings() -> IcebergParquetWriteSettings { + IcebergParquetWriteSettings { + compression: ProtoCodec::Zstd as i32, + compression_level: Some(3), + row_group_size_bytes: 128 * 1024 * 1024, + page_size_bytes: 1024 * 1024, + dict_size_bytes: 2 * 1024 * 1024, + page_row_limit: 20_000, + default_statistics_enabled: true, + statistics_truncate_length: Some(16), + column_statistics: vec![], + created_by: "Apache Iceberg (Comet test)".to_string(), + } + } + + #[test] + fn translates_default_settings_to_zstd_3() { + let props = build_writer_properties(&base_settings()).unwrap(); + assert!(matches!( + props.compression(&"any".into()), + Compression::ZSTD(_) + )); + if let Compression::ZSTD(level) = props.compression(&"any".into()) { + assert_eq!(level.compression_level(), 3); + } else { + panic!("expected zstd compression"); + } + } + + #[test] + fn translates_each_codec_to_matching_parquet_compression() { + let codecs = [ + (ProtoCodec::None, Compression::UNCOMPRESSED), + (ProtoCodec::Snappy, Compression::SNAPPY), + (ProtoCodec::Lz4, Compression::LZ4), + ]; + for (proto, expected) in codecs { + let mut settings = base_settings(); + settings.compression = proto as i32; + settings.compression_level = None; + let props = build_writer_properties(&settings).unwrap(); + assert_eq!(props.compression(&"c".into()), expected, "codec={proto:?}"); + } + } + + #[test] + fn translates_gzip_with_explicit_level() { + let mut settings = base_settings(); + settings.compression = ProtoCodec::Gzip as i32; + settings.compression_level = Some(9); + let props = build_writer_properties(&settings).unwrap(); + match props.compression(&"c".into()) { + Compression::GZIP(level) => assert_eq!(level.compression_level(), 9), + other => panic!("expected gzip, got {other:?}"), + } + } + + #[test] + fn translates_brotli_default_level_to_one() { + let mut settings = base_settings(); + settings.compression = ProtoCodec::Brotli as i32; + settings.compression_level = None; + let props = build_writer_properties(&settings).unwrap(); + match props.compression(&"c".into()) { + Compression::BROTLI(level) => assert_eq!(level.compression_level(), 1), + other => panic!("expected brotli, got {other:?}"), + } + } + + #[test] + fn translates_size_settings_to_parquet_setters() { + let mut settings = base_settings(); + settings.row_group_size_bytes = 64 * 1024 * 1024; + settings.page_size_bytes = 65_536; + settings.dict_size_bytes = 1_048_576; + settings.page_row_limit = 1_000; + let props = build_writer_properties(&settings).unwrap(); + assert_eq!(props.max_row_group_bytes(), Some(64 * 1024 * 1024)); + assert_eq!(props.data_page_size_limit(), 65_536); + assert_eq!(props.dictionary_page_size_limit(), 1_048_576); + assert_eq!(props.data_page_row_count_limit(), 1_000); + } + + #[test] + fn disables_statistics_when_default_off() { + let mut settings = base_settings(); + settings.default_statistics_enabled = false; + settings.statistics_truncate_length = None; + let props = build_writer_properties(&settings).unwrap(); + assert_eq!( + props.statistics_enabled(&"c".into()), + EnabledStatistics::None + ); + } + + #[test] + fn applies_default_truncate_length() { + let mut settings = base_settings(); + settings.statistics_truncate_length = Some(32); + let props = build_writer_properties(&settings).unwrap(); + assert_eq!(props.statistics_truncate_length(), Some(32)); + } + + #[test] + fn full_mode_removes_truncate_length() { + let mut settings = base_settings(); + settings.statistics_truncate_length = None; + let props = build_writer_properties(&settings).unwrap(); + assert_eq!(props.statistics_truncate_length(), None); + } + + #[test] + fn applies_per_column_statistics_overrides() { + let mut settings = base_settings(); + settings.column_statistics = vec![ + IcebergParquetColumnStatistics { + column: "id".to_string(), + enabled: true, + truncate_length: None, + }, + IcebergParquetColumnStatistics { + column: "payload".to_string(), + enabled: false, + truncate_length: None, + }, + ]; + let props = build_writer_properties(&settings).unwrap(); + assert_eq!( + props.statistics_enabled(&ColumnPath::new(vec!["id".to_string()])), + EnabledStatistics::Page + ); + assert_eq!( + props.statistics_enabled(&ColumnPath::new(vec!["payload".to_string()])), + EnabledStatistics::None + ); + } + + #[test] + fn nested_column_name_splits_on_dot() { + let mut settings = base_settings(); + settings.column_statistics = vec![IcebergParquetColumnStatistics { + column: "user.address.zip".to_string(), + enabled: false, + truncate_length: None, + }]; + let props = build_writer_properties(&settings).unwrap(); + let path = ColumnPath::new(vec![ + "user".to_string(), + "address".to_string(), + "zip".to_string(), + ]); + assert_eq!(props.statistics_enabled(&path), EnabledStatistics::None); + } + + #[test] + fn passes_created_by_through() { + let mut settings = base_settings(); + settings.created_by = "Apache Iceberg 1.7.1 (Comet 0.16.0)".to_string(); + let props = build_writer_properties(&settings).unwrap(); + assert_eq!(props.created_by(), "Apache Iceberg 1.7.1 (Comet 0.16.0)"); + } + + #[test] + fn rejects_unknown_codec() { + let mut settings = base_settings(); + settings.compression = 999; + let err = build_writer_properties(&settings).unwrap_err(); + assert!(format!("{err}").contains("Unknown CompressionCodec")); + } + + #[test] + fn rejects_out_of_range_zstd_level() { + let mut settings = base_settings(); + settings.compression = ProtoCodec::Zstd as i32; + settings.compression_level = Some(100); + let err = build_writer_properties(&settings).unwrap_err(); + assert!(format!("{err}").contains("zstd")); + } + + #[test] + fn build_output_schema_has_single_binary_column() { + let schema = build_output_schema(); + assert_eq!(schema.fields().len(), 1); + assert_eq!(schema.field(0).name(), "iceberg_manifest"); + assert_eq!(schema.field(0).data_type(), &DataType::Binary); + } + + #[test] + fn file_name_prefix_pads_ids() { + let prefix = file_name_prefix(Some(7), Some(42), "op-abc"); + assert_eq!(prefix, "00007-00042-op-abc"); + } + + #[test] + fn file_name_prefix_defaults_unset_ids_to_zero() { + let prefix = file_name_prefix(None, None, "x"); + assert_eq!(prefix, "00000-00000-x"); + } + + // -- Integration tests against the real iceberg-rust writer stack --------- + + mod integration { + use super::super::*; + use arrow::array::{Int32Array, StringArray}; + use datafusion::common::Result as DFResult; + use datafusion::physical_plan::stream::RecordBatchStreamAdapter; + use datafusion_comet_proto::spark_operator::{ + CompressionCodec as ProtoCodec, IcebergParquetWriteSettings, IcebergWriteCommon, + IcebergWriterMode as ProtoIcebergWriterMode, + }; + use iceberg::spec::{ + Manifest, NestedField, PartitionSpec, PrimitiveType, Schema, Transform, Type, + }; + use parquet::file::properties::WriterProperties; + use std::collections::HashMap; + use std::sync::Arc; + use tempfile::TempDir; + + fn user_schema() -> SchemaRef { + Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("region", DataType::Utf8, false), + ])) + } + + fn iceberg_user_schema() -> Schema { + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(2, "region", Type::Primitive(PrimitiveType::String)) + .into(), + ]) + .build() + .unwrap() + } + + fn batch(ids: &[i32], regions: &[&str]) -> RecordBatch { + RecordBatch::try_new( + user_schema(), + vec![ + Arc::new(Int32Array::from(ids.to_vec())), + Arc::new(StringArray::from(regions.to_vec())), + ], + ) + .unwrap() + } + + fn input_stream(batches: Vec) -> SendableRecordBatchStream { + let schema = user_schema(); + Box::pin(RecordBatchStreamAdapter::new( + schema, + futures::stream::iter(batches.into_iter().map(Ok::<_, DataFusionError>)), + )) + } + + fn common( + data_location: String, + spec_json: String, + schema_json: String, + writer_mode: ProtoIcebergWriterMode, + ) -> Arc { + let settings = IcebergParquetWriteSettings { + compression: ProtoCodec::Zstd as i32, + compression_level: Some(3), + row_group_size_bytes: 128 * 1024 * 1024, + page_size_bytes: 1024 * 1024, + dict_size_bytes: 2 * 1024 * 1024, + page_row_limit: 20_000, + default_statistics_enabled: true, + statistics_truncate_length: Some(16), + column_statistics: vec![], + created_by: "Apache Iceberg (Comet integration test)".to_string(), + }; + Arc::new(IcebergWriteCommon { + catalog_properties: HashMap::new(), + metadata_location: "file:/tmp/metadata.json".to_string(), + iceberg_schema_json: schema_json, + partition_spec_json: spec_json, + sort_order_id: 0, + data_location, + operation_id: "test-op".to_string(), + target_file_size_bytes: 512 * 1024 * 1024, + writer_mode: writer_mode as i32, + parquet_settings: Some(settings), + catalog_name: String::new(), + }) + } + + async fn run( + common: Arc, + schema: Schema, + spec: PartitionSpec, + writer_mode: ProtoIcebergWriterMode, + batches: Vec, + ) -> DFResult> { + run_write_task( + input_stream(batches), + common, + Arc::new(schema), + Arc::new(spec), + writer_mode, + WriterProperties::builder().build(), + Some(0), + Some(0), + ) + .await + } + + #[tokio::test] + async fn unpartitioned_write_emits_single_file_with_all_rows() { + let temp_dir = TempDir::new().unwrap(); + let data_location = format!("file://{}", temp_dir.path().display()); + let schema = iceberg_user_schema(); + let spec = PartitionSpec::builder(Arc::new(schema.clone())) + .build() + .unwrap(); + let common = common( + data_location.clone(), + serde_json::to_string(&spec).unwrap(), + serde_json::to_string(&schema).unwrap(), + ProtoIcebergWriterMode::IcebergWriterUnpartitioned, + ); + + let data_files = run( + common, + schema, + spec, + ProtoIcebergWriterMode::IcebergWriterUnpartitioned, + vec![batch(&[1, 2, 3], &["us", "eu", "us"])], + ) + .await + .unwrap(); + + assert_eq!(data_files.len(), 1); + assert_eq!(data_files[0].record_count(), 3); + assert!(data_files[0] + .file_path() + .contains(temp_dir.path().to_str().unwrap())); + assert!(data_files[0].file_path().ends_with(".parquet")); + } + + #[tokio::test] + async fn fanout_partitioned_write_produces_one_file_per_partition() { + let temp_dir = TempDir::new().unwrap(); + let data_location = format!("file://{}", temp_dir.path().display()); + let schema = iceberg_user_schema(); + let spec = PartitionSpec::builder(Arc::new(schema.clone())) + .with_spec_id(1) + .add_partition_field("region", "region", Transform::Identity) + .unwrap() + .build() + .unwrap(); + let common = common( + data_location, + serde_json::to_string(&spec).unwrap(), + serde_json::to_string(&schema).unwrap(), + ProtoIcebergWriterMode::IcebergWriterFanout, + ); + + let data_files = run( + common, + schema, + spec, + ProtoIcebergWriterMode::IcebergWriterFanout, + vec![batch(&[1, 2, 3, 4], &["us", "eu", "us", "eu"])], + ) + .await + .unwrap(); + + assert_eq!(data_files.len(), 2); + let total: u64 = data_files.iter().map(|f| f.record_count()).sum(); + assert_eq!(total, 4); + } + + #[tokio::test] + async fn clustered_partitioned_write_handles_sorted_input() { + let temp_dir = TempDir::new().unwrap(); + let data_location = format!("file://{}", temp_dir.path().display()); + let schema = iceberg_user_schema(); + let spec = PartitionSpec::builder(Arc::new(schema.clone())) + .with_spec_id(1) + .add_partition_field("region", "region", Transform::Identity) + .unwrap() + .build() + .unwrap(); + let common = common( + data_location, + serde_json::to_string(&spec).unwrap(), + serde_json::to_string(&schema).unwrap(), + ProtoIcebergWriterMode::IcebergWriterClustered, + ); + + // ClusteredWriter requires partition-sorted input. + let data_files = run( + common, + schema, + spec, + ProtoIcebergWriterMode::IcebergWriterClustered, + vec![batch(&[1, 2, 3, 4], &["eu", "eu", "us", "us"])], + ) + .await + .unwrap(); + + assert_eq!(data_files.len(), 2); + let total: u64 = data_files.iter().map(|f| f.record_count()).sum(); + assert_eq!(total, 4); + } + + #[tokio::test] + async fn encoded_manifest_round_trips_through_iceberg_parser() { + let temp_dir = TempDir::new().unwrap(); + let data_location = format!("file://{}", temp_dir.path().display()); + let schema = iceberg_user_schema(); + let spec = PartitionSpec::builder(Arc::new(schema.clone())) + .build() + .unwrap(); + let common = common( + data_location, + serde_json::to_string(&spec).unwrap(), + serde_json::to_string(&schema).unwrap(), + ProtoIcebergWriterMode::IcebergWriterUnpartitioned, + ); + + let schema_arc = Arc::new(schema); + let spec_arc = Arc::new(spec); + let data_files = run_write_task( + input_stream(vec![batch(&[10, 20], &["x", "y"])]), + Arc::clone(&common), + Arc::clone(&schema_arc), + Arc::clone(&spec_arc), + ProtoIcebergWriterMode::IcebergWriterUnpartitioned, + WriterProperties::builder().build(), + Some(0), + Some(0), + ) + .await + .unwrap(); + + let manifest_bytes = encode_data_files_as_manifest( + data_files.clone(), + Arc::clone(&schema_arc), + Arc::clone(&spec_arc), + Some(0), + Some(0), + &common.operation_id, + ) + .await + .unwrap(); + let output_schema = build_output_schema(); + let batch = build_output_batch(manifest_bytes.clone(), &output_schema).unwrap(); + assert_eq!(batch.num_rows(), 1); + + let manifest = Manifest::parse_avro(&manifest_bytes).unwrap(); + let entries = manifest.entries(); + assert_eq!(entries.len(), data_files.len()); + assert_eq!( + entries[0].data_file().record_count(), + data_files[0].record_count() + ); + assert_eq!( + entries[0].data_file().file_path(), + data_files[0].file_path() + ); + } + + #[tokio::test] + async fn decorate_batch_adds_field_ids() { + let schema = iceberg_user_schema(); + let target = Arc::new(iceberg::arrow::schema_to_arrow_schema(&schema).unwrap()); + let original = batch(&[1, 2], &["a", "b"]); + let decorated = decorate_batch_with_field_ids(original, &target).unwrap(); + let arrow_schema = decorated.schema(); + assert_eq!( + arrow_schema + .field(0) + .metadata() + .get(PARQUET_FIELD_ID_META_KEY), + Some(&"1".to_string()) + ); + assert_eq!( + arrow_schema + .field(1) + .metadata() + .get(PARQUET_FIELD_ID_META_KEY), + Some(&"2".to_string()) + ); + } + + #[tokio::test] + async fn decorate_batch_rejects_column_count_mismatch() { + let schema = iceberg_user_schema(); + let target = Arc::new(iceberg::arrow::schema_to_arrow_schema(&schema).unwrap()); + let arrow_schema = Arc::new(ArrowSchema::new(vec![Field::new( + "unknown", + DataType::Int32, + false, + )])); + let batch = + RecordBatch::try_new(arrow_schema, vec![Arc::new(Int32Array::from(vec![1]))]) + .unwrap(); + let err = decorate_batch_with_field_ids(batch, &target).unwrap_err(); + assert!(format!("{err}").contains("column count mismatch")); + } + } +} diff --git a/native/core/src/execution/operators/mod.rs b/native/core/src/execution/operators/mod.rs index 4b2c06575d..d4ba8148cc 100644 --- a/native/core/src/execution/operators/mod.rs +++ b/native/core/src/execution/operators/mod.rs @@ -26,7 +26,10 @@ pub use scan::*; mod copy; mod expand; pub use expand::ExpandExec; +mod iceberg_common; mod iceberg_scan; +mod iceberg_write; +pub use iceberg_write::IcebergWriteExec; mod parquet_writer; pub use parquet_writer::ParquetWriterExec; mod csv_scan; diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index c6160bddd4..8dc209433f 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -23,6 +23,7 @@ pub mod operator_registry; use crate::execution::operators::init_csv_datasource_exec; use crate::execution::operators::IcebergScanExec; +use crate::execution::operators::IcebergWriteExec; use crate::execution::{ expressions::list_positions::ListPositionsExpr, expressions::subquery::Subquery, @@ -1554,6 +1555,24 @@ impl PhysicalPlanner { )), )) } + OpStruct::IcebergWrite(iceberg_write) => { + assert_eq!(children.len(), 1); + let (scans, shuffle_scans, child) = + self.create_plan(&children[0], inputs, partition_count)?; + let exec = Arc::new(IcebergWriteExec::try_new( + Arc::clone(&child.native_plan), + iceberg_write.clone(), + )?); + Ok(( + scans, + shuffle_scans, + Arc::new(SparkPlan::new( + spark_plan.plan_id, + exec, + vec![Arc::clone(&child)], + )), + )) + } OpStruct::ParquetWriter(writer) => { assert_eq!(children.len(), 1); let (scans, shuffle_scans, child) = diff --git a/native/core/src/execution/planner/operator_registry.rs b/native/core/src/execution/planner/operator_registry.rs index eb31184461..4937cddff9 100644 --- a/native/core/src/execution/planner/operator_registry.rs +++ b/native/core/src/execution/planner/operator_registry.rs @@ -46,6 +46,7 @@ pub enum OperatorType { Scan, NativeScan, IcebergScan, + IcebergWrite, Projection, Filter, HashAgg, @@ -142,6 +143,7 @@ fn get_operator_type(spark_operator: &Operator) -> Option { OpStruct::Scan(_) => Some(OperatorType::Scan), OpStruct::NativeScan(_) => Some(OperatorType::NativeScan), OpStruct::IcebergScan(_) => Some(OperatorType::IcebergScan), + OpStruct::IcebergWrite(_) => Some(OperatorType::IcebergWrite), OpStruct::ShuffleWriter(_) => Some(OperatorType::ShuffleWriter), OpStruct::ParquetWriter(_) => Some(OperatorType::ParquetWriter), OpStruct::Expand(_) => Some(OperatorType::Expand), diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto index 7f50aa928c..c1d2f9c51b 100644 --- a/native/proto/src/proto/operator.proto +++ b/native/proto/src/proto/operator.proto @@ -53,6 +53,7 @@ message Operator { Explode explode = 114; CsvScan csv_scan = 115; ShuffleScan shuffle_scan = 116; + IcebergWrite iceberg_write = 117; } } @@ -277,6 +278,146 @@ message IcebergDeleteFile { uint64 file_size_in_bytes = 5; } +// === Iceberg writes ========================================================= +// +// Mirrors the read-side split: `IcebergWriteCommon` is the per-write payload broadcast once to +// every task; `IcebergWrite` wraps the common payload and adds per-task fields filled in at +// execution time (partitionId/taskAttemptId). The rust side translates `IcebergWriteCommon` and +// `IcebergParquetWriteSettings` into the iceberg-rust writer stack +// (`ParquetWriterBuilder` -> `RollingFileWriterBuilder` -> `DataFileWriterBuilder` -> partitioner) +// and returns one `IcebergFileWriteResult` per file the rolling writer closes. + +// Picks which iceberg-rust writer the rust side instantiates. Resolved once on the JVM driver +// (via `spec.isUnpartitioned()` and `SparkWriteConf.useFanoutWriter`) so the rust side never +// re-derives the choice. +enum IcebergWriterMode { + ICEBERG_WRITER_UNPARTITIONED = 0; + ICEBERG_WRITER_FANOUT = 1; + ICEBERG_WRITER_CLUSTERED = 2; +} + +// Per-column statistics override. Resolved from `write.metadata.metrics.column.` and +// emitted only when the value differs from the table-level default. +// +// - `enabled = false` => column's stats are turned off entirely (Iceberg `none` mode). +// - `truncate_length` => Iceberg `truncate(N)` mode for this column; absent means `full`. +message IcebergParquetColumnStatistics { + string column = 1; + bool enabled = 2; + optional uint32 truncate_length = 3; +} + +// Parquet-rs `WriterProperties` knobs derived from Iceberg table properties on the JVM driver. +// All fields use Iceberg defaults when absent in the table properties; the rust side does not +// re-apply defaults. +message IcebergParquetWriteSettings { + CompressionCodec compression = 1; + // Codec-dependent: zstd 1-22, gzip 0-9, brotli 0-11. JVM side substitutes parquet-mr's + // per-codec default when the property is unset and parquet-rs's default would diverge + // (zstd: 3 instead of parquet-rs's 1). Absent => use parquet-rs's natural default. + optional int32 compression_level = 2; + // Iceberg `write.parquet.row-group-size-bytes` (default 128 MiB). + uint64 row_group_size_bytes = 3; + // Iceberg `write.parquet.page-size-bytes` (default 1 MiB). + uint64 page_size_bytes = 4; + // Iceberg `write.parquet.dict-size-bytes` (default 2 MiB). + uint64 dict_size_bytes = 5; + // Iceberg `write.parquet.page-row-limit` (default 20_000). + uint32 page_row_limit = 6; + // Table-level statistics on/off. Driven by `write.metadata.metrics.default = none`, which + // disables all stats; any other value leaves it true. `counts` is filtered out by the + // detection rule earlier in the pipeline. + bool default_statistics_enabled = 7; + // Resolved from `write.metadata.metrics.default = truncate(N)`. Absent means "full" (no + // truncation). + optional uint32 statistics_truncate_length = 8; + repeated IcebergParquetColumnStatistics column_statistics = 9; + // String written into parquet file metadata. JVM-side default is + // `"Apache Iceberg (Comet )"`. + string created_by = 10; +} + +// Broadcast payload -- one of these per write, identical for every task. +message IcebergWriteCommon { + // Catalog properties for FileIO (same shape as `IcebergScanCommon.catalog_properties`). + map catalog_properties = 1; + + // Iceberg table metadata file path. Kept on the wire for plan-debug visibility (shows up in + // `IcebergWriteExec`'s `DisplayAs`); `FileIO` itself is initialised from `data_location` + // since both paths resolve to the same storage scheme. + string metadata_location = 2; + + // Iceberg `SchemaParser.toJson(table.schema())`. Carries the schema id inside the JSON; the + // rust side parses it via `serde_json::from_str::`. + string iceberg_schema_json = 3; + + // Iceberg `PartitionSpecParser.toJson(table.spec())`. Same wire format as one entry in + // `IcebergScanCommon.partition_spec_pool`. Carries the spec id inside the JSON. + string partition_spec_json = 4; + + // Iceberg `SparkWriteConf.outputSortOrderId(...)`. Not embedded in any of the JSON payloads + // and stamped onto each output `DataFile`, so it ships as its own field. + int32 sort_order_id = 5; + + // Directory under which data files are written; resolved on the JVM via the table's + // `LocationProvider` plus any `write.data.path` override. + string data_location = 6; + + // Iceberg `OutputFileFactory` operation UUID -- embedded in filenames so files from the same + // logical write are correlated across tasks. + string operation_id = 7; + + // Target file size in bytes; the rust rolling writer rolls files at this threshold. + uint64 target_file_size_bytes = 8; + + IcebergWriterMode writer_mode = 9; + + IcebergParquetWriteSettings parquet_settings = 10; + + // Spark V2 catalog name that loaded this table. Forwarded as the dispatchKey to + // CometS3CredentialDispatcher.ensureInitialized (and the access intent to the credential + // provider) so two catalogs sharing one provider class get isolated provider instances. Empty + // string when the table has no catalog identity (e.g. HadoopTables loaded by raw path). Mirrors + // `IcebergScanCommon.catalog_name`. + string catalog_name = 11; +} + +// Single Iceberg write operator. Per-task fields are populated by the JVM exec wrapper at task +// launch (matching how `ParquetWriter.task_attempt_id` is filled in `CometNativeWriteExec`). +message IcebergWrite { + IcebergWriteCommon common = 1; + + // Spark `TaskContext.get().partitionId()` -- 0-padded into the filename prefix + // `{partition_id:05d}-{task_attempt_id}-{operation_id}`. + optional int32 partition_id = 2; + + // Spark `TaskContext.get().taskAttemptId()`. + optional int64 task_attempt_id = 3; +} + +// One row in the JVM-side commit-message aggregation: returned by the rust writer for each file +// the rolling writer closes. The JVM passes `parquet_footer_thrift` to +// `ParquetUtil.footerMetrics(...)` (which honours `MetricsConfig` truncation automatically), +// then assembles `DataFile`s from these plus `partition_data` and commits via Iceberg's +// `Transaction`. +message IcebergFileWriteResult { + // Path relative to `IcebergWriteCommon.data_location` so the catalog stores stable paths even + // if the data directory is later remapped. + string relative_path = 1; + + uint64 record_count = 2; + uint64 file_size_in_bytes = 3; + + // Raw thrift bytes of the parquet `FileMetaData`. Iceberg-java's + // `ParquetUtil.footerMetrics(metadata, fieldMetrics, metricsConfig, nameMapping)` consumes + // this and produces an Iceberg `Metrics` (bounds, null counts, value counts, column sizes, + // split offsets). + bytes parquet_footer_thrift = 4; + + // Same `PartitionData` wire format used by the read-side scan pool entries. + PartitionData partition_data = 5; +} + message Projection { repeated spark.spark_expression.Expr project_list = 1; } @@ -315,6 +456,10 @@ enum CompressionCodec { Zstd = 1; Lz4 = 2; Snappy = 3; + // Gzip and Brotli are added for Iceberg writes which inherit Parquet's full codec set. + // The pre-existing ShuffleWriter and ParquetWriter paths do not exercise these values. + Gzip = 4; + Brotli = 5; } message ShuffleWriter { diff --git a/spark/src/main/scala/org/apache/comet/CometConf.scala b/spark/src/main/scala/org/apache/comet/CometConf.scala index 78ea0f0168..3d916d99a1 100644 --- a/spark/src/main/scala/org/apache/comet/CometConf.scala +++ b/spark/src/main/scala/org/apache/comet/CometConf.scala @@ -121,6 +121,36 @@ object CometConf extends ShimCometConf { .booleanConf .createWithDefault(true) + val COMET_ICEBERG_WRITE_SPLIT_OPERATOR_ENABLED: ConfigEntry[Boolean] = + conf("spark.comet.write.iceberg.splitOperator.enabled") + .category(CATEGORY_TESTING) + .doc( + "Whether to rewrite Iceberg V2 writes from Spark's combined V2 write/commit operator " + + "into Comet's two-operator shape: a separate inner write exec (inside AQE) and outer " + + "commit exec (outside AQE). This lets Comet convert the data sub-query natively " + + "without affecting Iceberg's commit semantics. Required gate for native parquet " + + "writes (`spark.comet.write.iceberg.nativeAcceleration`). Off by default; Iceberg-Java " + + "writes the data unchanged when the gate is off. Highly experimental.") + .booleanConf + .createWithDefault(false) + + val COMET_ICEBERG_NATIVE_WRITE_ENABLED: ConfigEntry[Boolean] = + conf("spark.comet.write.iceberg.nativeAcceleration") + .category(CATEGORY_TESTING) + .doc( + "Whether to delegate the executor-side Parquet write to Comet's native (iceberg-rust) " + + "writer when the table's properties allow it. Requires " + + "`spark.comet.write.iceberg.splitOperator.enabled = true`. Off by default. " + + "Supported on Spark 3.4 (with Iceberg 1.5.2) and Spark 3.5 / 4.0 (with Iceberg 1.8+). " + + "Falls back to the JVM two-op path automatically for unsupported scenarios " + + "(MERGE, encrypted / custom-location-provider tables, non-parquet format, " + + "format-version >= 3, counts-only metrics, bloom filters, non-default parquet " + + "row-group cadence, schemas wider than " + + "`write.metadata.metrics.max-inferred-column-defaults`). See " + + "`docs/source/user-guide/latest/iceberg-writes.md` for the full trigger table.") + .booleanConf + .createWithDefault(false) + val COMET_ICEBERG_DATA_FILE_CONCURRENCY_LIMIT: ConfigEntry[Int] = conf("spark.comet.scan.icebergNative.dataFileConcurrencyLimit") .category(CATEGORY_SCAN) diff --git a/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala b/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala index 6c4a92f312..fe7c09af27 100644 --- a/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala +++ b/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.execution._ import org.apache.spark.sql.internal.SQLConf import org.apache.comet.CometConf._ +import org.apache.comet.iceberg.IcebergWriteStrategy import org.apache.comet.rules.{CometExecRule, CometPlanAdaptiveDynamicPruningFilters, CometReuseSubquery, CometScanRule, CometSpark34AqeDppFallbackRule, EliminateRedundantTransitions} import org.apache.comet.shims.ShimCometSparkSessionExtensions @@ -97,6 +98,9 @@ class CometSparkSessionExtensions extensions.injectQueryStagePrepRule { session => CometExecRule(session) } injectQueryStageOptimizerRuleShim(extensions, CometPlanAdaptiveDynamicPruningFilters) injectQueryStageOptimizerRuleShim(extensions, CometReuseSubquery) + // Runs before Spark's DataSourceV2Strategy because `experimentalMethods.extraStrategies` is + // prepended to the planner's strategy list. + extensions.injectPlannerStrategy { session => IcebergWriteStrategy(session) } } case class CometScanColumnar(session: SparkSession) extends ColumnarRule { diff --git a/spark/src/main/scala/org/apache/comet/iceberg/IcebergReflection.scala b/spark/src/main/scala/org/apache/comet/iceberg/IcebergReflection.scala index 2a39369c76..51bcd1d884 100644 --- a/spark/src/main/scala/org/apache/comet/iceberg/IcebergReflection.scala +++ b/spark/src/main/scala/org/apache/comet/iceberg/IcebergReflection.scala @@ -51,6 +51,24 @@ object IcebergReflection extends Logging { val UNBOUND_PREDICATE = "org.apache.iceberg.expressions.UnboundPredicate" val SPARK_BATCH_QUERY_SCAN = "org.apache.iceberg.spark.source.SparkBatchQueryScan" val SPARK_STAGED_SCAN = "org.apache.iceberg.spark.source.SparkStagedScan" + val SPARK_WRITE = "org.apache.iceberg.spark.source.SparkWrite" + val TABLE_PROPERTIES = "org.apache.iceberg.TableProperties" + val TYPE_UTIL = "org.apache.iceberg.types.TypeUtil" + val INMEMORY_INPUT_FILE = "org.apache.iceberg.inmemory.InMemoryInputFile" + val INMEMORY_FILE_IO = "org.apache.iceberg.inmemory.InMemoryFileIO" + val INPUT_FILE = "org.apache.iceberg.io.InputFile" + val FILE_IO = "org.apache.iceberg.io.FileIO" + val GENERIC_MANIFEST_FILE = "org.apache.iceberg.GenericManifestFile" + val MANIFEST_FILE = "org.apache.iceberg.ManifestFile" + val MANIFEST_FILES = "org.apache.iceberg.ManifestFiles" + val DATA_FILE = "org.apache.iceberg.DataFile" + + // Iceberg 1.5.2 (Spark 3.4 profile) ships its own `ReplaceIcebergData` logical write node via + // the iceberg-spark-extensions module instead of using Spark's stock `ReplaceData`. Iceberg + // 1.8+ dropped it in favour of Spark 3.5's native row-level operation support, so this class + // name will only resolve on the 3.4 + Iceberg 1.5.2 combo (where SQL UPDATE / MERGE on V2 + // tables require the Iceberg extension). + val REPLACE_ICEBERG_DATA = "org.apache.spark.sql.catalyst.plans.logical.ReplaceIcebergData" } /** @@ -94,6 +112,101 @@ object IcebergReflection extends Logging { val UNKNOWN = "unknown" } + /** + * Probes a class via reflection once, caches the outcome. Returns `None` when Iceberg is not on + * the classpath so Comet stays buildable in non-Iceberg deployments. + */ + private def tryLoadClass(name: String): Option[Class[_]] = + try Some(loadClass(name)) + catch { case _: ClassNotFoundException => None } + + private lazy val sparkWriteClassOpt: Option[Class[_]] = tryLoadClass(ClassNames.SPARK_WRITE) + + /** + * Whether `write` is an Iceberg `SparkWrite` (or subclass). Returns false if Iceberg is not on + * the classpath. Used by the write strategy to decide whether to intercept a V2 logical write. + */ + def isIcebergSparkWrite(write: Any): Boolean = + sparkWriteClassOpt.exists(_.isInstance(write)) + + /** + * Whether `batchWrite` is one of Iceberg's `SparkWrite` inner `BatchWrite` impls + * (`BatchAppend`, `OverwriteByFilter`, `DynamicOverwrite`, `CopyOnWriteOperation`, + * `RewriteFiles`). These are private inner classes of `SparkWrite`; we detect them by name + * prefix because they are not exposed as a public superclass. + */ + def isIcebergBatchWrite(batchWrite: Any): Boolean = { + if (batchWrite == null) return false + batchWrite.getClass.getName.startsWith(ClassNames.SPARK_WRITE + "$") + } + + /** + * Extract the outer `SparkWrite` instance from an Iceberg `BatchWrite` inner class via the + * synthetic `this$0` reference Java compilers emit for non-static inner classes. The `Table` + * and table-property accessors live on the outer class. + */ + def getOuterSparkWrite(batchWrite: Any): Option[Any] = { + if (batchWrite == null) None + else { + try { + val field = batchWrite.getClass.getDeclaredField("this$0") + field.setAccessible(true) + Option(field.get(batchWrite)) + } catch { + case _: NoSuchFieldException => + // batchWrite may already be the outer class (static inner) -- fall back to itself. + Some(batchWrite) + case e: Exception => + logError( + s"Iceberg reflection failure: outer SparkWrite from BatchWrite: ${e.getMessage}") + None + } + } + } + + /** + * Whether `plan` is Iceberg's `ReplaceIcebergData` logical node (Iceberg 1.5.2's UPDATE / MERGE + * rewrite target on Spark 3.4). Matched by FQCN so the main module doesn't compile-depend on + * iceberg-spark-extensions. + */ + def isReplaceIcebergData(plan: Any): Boolean = + plan != null && plan.getClass.getName == ClassNames.REPLACE_ICEBERG_DATA + + /** + * Read a declared (or inherited) field on `plan` reflectively. Used for extracting fields off + * Iceberg 1.5.2's `ReplaceIcebergData` logical node when iceberg-spark-extensions isn't on the + * main classpath. + */ + private def reflectField(plan: Any, fieldName: String): Option[AnyRef] = + try { + val field = plan.getClass.getDeclaredField(fieldName) + field.setAccessible(true) + Option(field.get(plan)) + } catch { + case e: Exception => + logError( + s"Iceberg reflection failure: $fieldName on ${plan.getClass.getName}: ${e.getMessage}") + None + } + + /** + * Pull `(table, query, originalTable, write)` out of a `ReplaceIcebergData` instance. Iceberg's + * 1.5.2 case class shape matches Spark 3.5+'s stock `ReplaceData` exactly, so the extracted + * tuple can feed our `matchedSparkWrite` dispatcher unchanged. + */ + def extractReplaceIcebergDataFields(plan: Any): Option[(AnyRef, AnyRef, AnyRef, AnyRef)] = { + if (!isReplaceIcebergData(plan)) return None + for { + table <- reflectField(plan, "table") + query <- reflectField(plan, "query") + originalTable <- reflectField(plan, "originalTable") + write <- reflectField( + plan, + "write" + ) // Option[Write]; field can be Some(null) so kept AnyRef + } yield (table, query, originalTable, write) + } + /** * Loads a class using the thread context classloader first, then falls back to the system * classloader. @@ -639,6 +752,534 @@ object IcebergReflection extends Logging { unsupportedTypes.toList } + + /** + * Reads a `private final` field from a `SparkWrite` instance via reflection. The field names + * referenced by callers (`queryId`, `targetFileSize`, `useFanoutWriter`, `outputSpecId`, + * `writeSchema`) are present on `SparkWrite` across the Iceberg 1.5.2 / 1.8.1 / 1.10.0 versions + * Comet supports. + */ + private def getSparkWriteField(sparkWrite: Any, fieldName: String): Option[Any] = + sparkWriteClassOpt.flatMap { cls => + try { + val field = cls.getDeclaredField(fieldName) + field.setAccessible(true) + Option(field.get(sparkWrite)) + } catch { + case _: NoSuchFieldException => + // Field may have been renamed across Iceberg versions. Callers that probe multiple + // candidate names (e.g. `useFanoutWriter` / `partitionedFanoutEnabled`) should expect + // this and `.orElse` onto the alternative. + None + case e: Exception => + logError( + s"Iceberg reflection failure: Failed to read SparkWrite.$fieldName: ${e.getMessage}") + None + } + } + + /** Operation id used in data-file names; sourced from `SparkWrite.queryId`. */ + def getOperationIdFromSparkWrite(sparkWrite: Any): Option[String] = + getSparkWriteField(sparkWrite, "queryId").map(_.asInstanceOf[String]) + + /** Target data file size in bytes; sourced from `SparkWrite.targetFileSize`. */ + def getTargetFileSizeFromSparkWrite(sparkWrite: Any): Option[Long] = + getSparkWriteField(sparkWrite, "targetFileSize") + .map(_.asInstanceOf[java.lang.Long].longValue()) + + /** + * Whether the planner would use a fanout writer. + * + * Field name changed between Iceberg releases: + * - 1.5.2 (Spark 3.4 profile): `partitionedFanoutEnabled` + * - 1.8.1+ (Spark 3.5 / 4.0 profiles): `useFanoutWriter` + * + * Same semantic in both versions. Try the newer name first; fall back to the older one so the + * helper resolves across all supported Iceberg versions without a per-version shim. + */ + def getUseFanoutWriterFromSparkWrite(sparkWrite: Any): Option[Boolean] = + getSparkWriteField(sparkWrite, "useFanoutWriter") + .orElse(getSparkWriteField(sparkWrite, "partitionedFanoutEnabled")) + .map(_.asInstanceOf[java.lang.Boolean].booleanValue()) + + /** Output partition spec id; sourced from `SparkWrite.outputSpecId`. */ + def getOutputSpecIdFromSparkWrite(sparkWrite: Any): Option[Int] = + getSparkWriteField(sparkWrite, "outputSpecId") + .map(_.asInstanceOf[java.lang.Integer].intValue()) + + /** Iceberg `Schema` the write was planned against; sourced from `SparkWrite.writeSchema`. */ + def getWriteSchemaFromSparkWrite(sparkWrite: Any): Option[Any] = + getSparkWriteField(sparkWrite, "writeSchema") + + /** + * Effective output file format resolved by Iceberg via `SparkWriteConf.dataFileFormat()`. Java + * consults the `write-format` write option BEFORE the `write.format.default` table property, so + * a per-write option override must win - gating only on table properties produces false-pass + * and false-fall-back outcomes when the two disagree. + * + * `SparkWrite.format` is a `FileFormat` enum (`PARQUET`/`ORC`/`AVRO`); returned lower-cased. + */ + def getFormatFromSparkWrite(sparkWrite: Any): Option[String] = + getSparkWriteField(sparkWrite, "format") + .map(_.toString.toLowerCase(java.util.Locale.ROOT)) + + /** + * Extracts the Spark V2 catalog name from an Iceberg `Table`. `Table.name()` returns + * `catalog.namespace.table` for tables loaded through a catalog; we intersect against the + * registered V2 catalogs so a value like `s3.foo` is not mistaken for a catalog `s3`. Returns + * `None` for HadoopTables loaded by raw path or when reflection fails. Shared by the scan and + * write paths so both derive the credential dispatch key identically. + */ + def deriveCatalogName(table: Any): Option[String] = + deriveCatalogName(table, registeredCatalogNames _) + + /** + * Test seam that lets tests inject a fixed catalog set without bootstrapping a SparkSession. + */ + private[iceberg] def deriveCatalogName( + table: Any, + knownCatalogNames: () => Iterable[String]): Option[String] = { + if (table == null) return None + invokeTableName(table).flatMap { name => + if (name.isEmpty || name == "null") { + None + } else { + knownCatalogNames() + .find(c => name == c || name.startsWith(c + ".")) + .orElse { + val idx = name.indexOf('.') + if (idx > 0) Some(name.substring(0, idx)) else None + } + } + } + } + + private def invokeTableName(table: Any): Option[String] = { + try { + table.getClass.getMethod("name").invoke(table) match { + case s: String => Some(s) + case other if other != null => Some(other.toString) + case null => None + } + } catch { + case e: Exception => + logWarning( + s"Iceberg reflection: Table.name() not callable on ${table.getClass.getName}. " + + "Native S3 credential dispatch will fall back to bucket-keyed isolation: " + + s"${e.getMessage}") + None + } + } + + private def registeredCatalogNames(): Iterable[String] = + try { + SparkSession.active.sessionState.catalogManager.listCatalogs(None) + } catch { + case e: Exception => + logDebug(s"Could not list V2 catalogs from SparkSession: ${e.getMessage}") + Nil + } + + /** + * Resolved write properties from `SparkWriteConf.writeProperties()`, snapshotted on the + * `SparkWrite` at planning time. Java's `RegistryBasedFileWriterFactory` merges these over the + * table's properties before constructing the per-file writer; build-time native settings must + * mirror that merge or per-write options (codec / level / variant-shredding / ...) silently + * lose their effect. + */ + def getWritePropertiesFromSparkWrite(sparkWrite: Any): Option[Map[String, String]] = + getSparkWriteField(sparkWrite, "writeProperties").map { raw => + val javaMap = raw.asInstanceOf[java.util.Map[String, String]] + val iter = javaMap.entrySet().iterator() + val builder = Map.newBuilder[String, String] + while (iter.hasNext) { + val entry = iter.next() + builder += (entry.getKey -> entry.getValue) + } + builder.result() + } + + /** + * Output sort order id resolved by Iceberg's + * `SparkWriteConf.outputSortOrderId(writeRequirements)`: a per-write `output-sort-order-id` + * option wins, else the table's sort order when an ordering is required, else `0` (unsorted). + * Stamping the table sort order id unconditionally (the previous behaviour) writes the wrong + * value when Java would have used `unsorted`. + * + * Iceberg 1.5.2 (Spark 3.4 profile) lacks this method on `SparkWriteConf`; we return `None` and + * let the caller fall back to `Table.sortOrder().orderId()`. + */ + def getOutputSortOrderIdFromSparkWrite(sparkWrite: Any): Option[Int] = { + val writeConf = + getSparkWriteField(sparkWrite, "writeConf").map(_.asInstanceOf[AnyRef]).getOrElse { + return None + } + val writeRequirements = + getSparkWriteField(sparkWrite, "writeRequirements").map(_.asInstanceOf[AnyRef]).getOrElse { + return None + } + try { + val method = writeConf.getClass.getDeclaredMethods + .find(m => m.getName == "outputSortOrderId" && m.getParameterCount == 1) + .getOrElse(return None) + method.setAccessible(true) + val result = method.invoke(writeConf, writeRequirements) + Some(result.asInstanceOf[java.lang.Integer].intValue()) + } catch { + case e: Exception => + logError( + "Iceberg reflection failure: SparkWriteConf.outputSortOrderId failed " + + s"(${e.getMessage}); falling back to table.sortOrder().orderId()") + None + } + } + + /** + * Reads the `private final` `table` field from a `SparkWrite`. Same setAccessible-required + * pattern as the other private-final accessors. + */ + def getTableFromSparkWrite(sparkWrite: Any): Option[Any] = + getSparkWriteField(sparkWrite, "table") + + /** + * Looks up a `PartitionSpec` from `Table.specs()` by its id. Used to retrieve the spec the + * write was planned against (`outputSpecId`), which may differ from the table's current spec + * for evolution scenarios. + */ + def getPartitionSpecById(table: Any, specId: Int): Option[Any] = + try { + val method = table.getClass.getMethod("specs") + val specs = method.invoke(table).asInstanceOf[java.util.Map[java.lang.Integer, _]] + Option(specs.get(java.lang.Integer.valueOf(specId))) + } catch { + case e: Exception => + logError( + "Iceberg reflection failure: Failed to look up partition spec " + + s"$specId: ${e.getMessage}") + None + } + + /** + * Base directory under which the writer should place data files. Resolved by invoking + * `table.locationProvider().newDataLocation("")` and trimming the trailing slash, matching the + * data location Iceberg-Java itself uses for `OutputFileFactory`. + */ + def getDataLocation(table: Any): Option[String] = + try { + val locationProviderMethod = + findMethodInHierarchy(table.getClass, "locationProvider").getOrElse( + throw new NoSuchMethodException( + s"locationProvider() not found on ${table.getClass.getName}")) + val provider = locationProviderMethod.invoke(table) + // `LocationProviders$DefaultLocationProvider` is package-private with a public + // `newDataLocation(String)`. Without `setAccessible(true)` JDK 11+ rejects the invoke. + val newDataLocMethod = provider.getClass.getMethod("newDataLocation", classOf[String]) + newDataLocMethod.setAccessible(true) + val location = newDataLocMethod.invoke(provider, "").asInstanceOf[String] + Some(location.stripSuffix("/")) + } catch { + case e: Exception => + logError(s"Iceberg reflection failure: Failed to get data location: ${e.getMessage}", e) + None + } + + /** Renders an Iceberg `Schema` to its JSON wire format via `SchemaParser.toJson(Schema)`. */ + def schemaToJson(schema: Any): Option[String] = + try { + val parserClass = loadClass(ClassNames.SCHEMA_PARSER) + val schemaClass = loadClass(ClassNames.SCHEMA) + val method = parserClass.getMethod("toJson", schemaClass) + Some(method.invoke(null, schema.asInstanceOf[AnyRef]).asInstanceOf[String]) + } catch { + case e: Exception => + logError(s"Iceberg reflection failure: SchemaParser.toJson: ${e.getMessage}") + None + } + + /** + * Renders an Iceberg `PartitionSpec` to its JSON wire format via + * `PartitionSpecParser.toJson(PartitionSpec)`. + */ + def partitionSpecToJson(spec: Any): Option[String] = + try { + val parserClass = loadClass(ClassNames.PARTITION_SPEC_PARSER) + val specClass = loadClass(ClassNames.PARTITION_SPEC) + val method = parserClass.getMethod("toJson", specClass) + Some(method.invoke(null, spec.asInstanceOf[AnyRef]).asInstanceOf[String]) + } catch { + case e: Exception => + logError(s"Iceberg reflection failure: PartitionSpecParser.toJson: ${e.getMessage}") + None + } + + private lazy val tablePropertiesClassOpt: Option[Class[_]] = + tryLoadClass(ClassNames.TABLE_PROPERTIES) + + /** + * Reads a static `String` constant from Iceberg's `org.apache.iceberg.TableProperties` by field + * name (e.g. `"DEFAULT_FILE_FORMAT"` -> `"write.format.default"`). Throws when Iceberg is + * absent or the constant has been renamed/removed; both indicate a version we have not vetted. + */ + def tablePropertyConstant(fieldName: String): String = + readTablePropertiesField(fieldName).asInstanceOf[String] + + /** + * Reads a static `int` constant from `TableProperties` (e.g. one of the `*_DEFAULT` numerics). + */ + def tablePropertyIntConstant(fieldName: String): Int = + readTablePropertiesField(fieldName).asInstanceOf[Integer].intValue() + + /** + * Like [[tablePropertyConstant]] but returns `None` when the constant is absent in the Iceberg + * version on the classpath rather than throwing. Used to gate behaviour that only some Iceberg + * versions implement -- e.g. `PARQUET_COLUMN_STATS_ENABLED_PREFIX`, added in 1.10.0; on 1.5.2 / + * 1.8.1 the corresponding property is silently ignored by Iceberg-Java, so there is nothing to + * gate. + */ + def tablePropertyConstantOpt(fieldName: String): Option[String] = + tablePropertiesClassOpt.flatMap { cls => + try Some(cls.getField(fieldName).get(null).asInstanceOf[String]) + catch { case _: NoSuchFieldException => None } + } + + private def readTablePropertiesField(fieldName: String): Any = { + val cls = tablePropertiesClassOpt.getOrElse( + throw new IllegalStateException(s"${ClassNames.TABLE_PROPERTIES} is not on the classpath")) + try cls.getField(fieldName).get(null) + catch { + case e: NoSuchFieldException => + throw new IllegalStateException( + s"${ClassNames.TABLE_PROPERTIES}.$fieldName not found " + + "(unsupported Iceberg version?)", + e) + } + } + + /** + * Returns the top-level column names of an Iceberg `Schema`, in declared order. Used by the + * native write serde to project Spark 4.x `ReplaceData` row streams (which carry an + * `__row_operation` column plus optional file-metadata columns) down to just the data columns + * the native iceberg-rust writer expects. + */ + def getSchemaFieldNames(schema: Any): Option[Seq[String]] = + try { + val cols = schema.getClass + .getMethod("columns") + .invoke(schema) + .asInstanceOf[java.util.List[_]] + val names = new scala.collection.mutable.ArrayBuffer[String](cols.size()) + val it = cols.iterator() + while (it.hasNext) { + val col = it.next().asInstanceOf[AnyRef] + names += col.getClass.getMethod("name").invoke(col).asInstanceOf[String] + } + Some(names.toSeq) + } catch { + case e: Exception => + logError(s"Iceberg reflection failure: Schema.columns(): ${e.getMessage}") + None + } + + /** + * Returns the count of projected field IDs in `schema` -- mirrors Iceberg-Java's + * `TypeUtil.getProjectedIds(Schema).size()` used to gate + * `write.metadata.metrics.max-inferred-column-defaults`. + */ + def getProjectedFieldIdCount(schema: Any): Option[Int] = { + for { + typeUtilClass <- tryLoadClass(ClassNames.TYPE_UTIL) + schemaClass <- tryLoadClass(ClassNames.SCHEMA) + } yield { + val method = typeUtilClass.getMethod("getProjectedIds", schemaClass) + val ids = method.invoke(null, schema.asInstanceOf[AnyRef]).asInstanceOf[java.util.Set[_]] + ids.size() + } + } + + /** + * Sum `recordCount` and `fileSizeInBytes` across `dataFiles` for SQL-metric reporting. The + * concrete `DataFile` impl (`BaseFile`) is package-private in Iceberg, so look the accessors up + * on the public `DataFile` interface instead; virtual dispatch still hits the concrete + * implementation at invoke time. + */ + def sumDataFileMetrics(dataFiles: java.util.List[_]): (Long, Long) = { + if (dataFiles.isEmpty) return (0L, 0L) + val dataFileClass = loadClass(ClassNames.DATA_FILE) + val recordCountMethod = dataFileClass.getMethod("recordCount") + val fileSizeMethod = dataFileClass.getMethod("fileSizeInBytes") + var rows = 0L + var bytes = 0L + val it = dataFiles.iterator() + while (it.hasNext) { + val df = it.next().asInstanceOf[AnyRef] + rows += recordCountMethod.invoke(df).asInstanceOf[java.lang.Long].longValue() + bytes += fileSizeMethod.invoke(df).asInstanceOf[java.lang.Long].longValue() + } + (rows, bytes) + } + + /** + * Stamp `sortOrderId` on every `DataFile` in `dataFiles`. iceberg-rust's writer doesn't expose + * the field (it's `pub(crate)` on `DataFile`), so the manifest comes back with sort_order_id + * unset. iceberg-java's `BaseFile` declares a private `sortOrderId: Integer` field that the + * normal `SparkWrite` path populates from the table's `outputSortOrderId`; we mirror that here + * by writing the same value via reflection before handing files to the committer. + */ + def stampSortOrderId(dataFiles: java.util.List[_], sortOrderId: Int): Unit = { + if (dataFiles.isEmpty) return + val baseFileClass = loadClass("org.apache.iceberg.BaseFile") + val field = baseFileClass.getDeclaredField("sortOrderId") + field.setAccessible(true) + val boxed = Integer.valueOf(sortOrderId) + val it = dataFiles.iterator() + while (it.hasNext) { + field.set(it.next().asInstanceOf[AnyRef], boxed) + } + } + + /** + * Construct a `SparkWrite$TaskCommit(DataFile[])` instance for the native commit path. The + * constructor is package-private; `setAccessible(true)` is required on every Iceberg version. + */ + def buildTaskCommit(dataFiles: java.util.List[_]): AnyRef = { + val taskCommitClass = loadClass("org.apache.iceberg.spark.source.SparkWrite$TaskCommit") + val dataFileClass = loadClass("org.apache.iceberg.DataFile") + val arrayClass = java.lang.reflect.Array.newInstance(dataFileClass, 0).getClass + val ctor = taskCommitClass.getDeclaredConstructor(arrayClass) + ctor.setAccessible(true) + val array = java.lang.reflect.Array.newInstance(dataFileClass, dataFiles.size()) + for (i <- 0 until dataFiles.size()) { + java.lang.reflect.Array.set(array, i, dataFiles.get(i)) + } + ctor.newInstance(array.asInstanceOf[AnyRef]).asInstanceOf[AnyRef] + } + + /** + * Returns the runtime Iceberg version (`IcebergBuild.version()`) when available. Used to stamp + * the Parquet `created_by` field so files written natively can be traced back to the Iceberg + * release whose property defaults Comet mirrored. + */ + def icebergVersion(): String = + try { + val cls = loadClass("org.apache.iceberg.IcebergBuild") + cls.getMethod("loadBuildInfo").invoke(null) + cls.getMethod("version").invoke(null).asInstanceOf[String] + } catch { + case e: Exception => + logWarning(s"Iceberg reflection failure: IcebergBuild.version: ${e.getMessage}") + "unknown" + } + + /** + * Decode the per-task manifest bytes the native Iceberg writer emits into an `Iterable` of + * Iceberg `DataFile` snapshots. The native operator writes one V2 data manifest per task + * (Avro-encoded) via iceberg-rust's `ManifestWriter`; this helper builds an + * `InMemoryFileIO`/`InMemoryInputFile` pair so the manifest stays in process, then reads it via + * the standard `ManifestFiles.read` entry point. Each entry is `copy()`-ed so the returned + * `DataFile`s outlive the reader. The 3-arg `GenericManifestFile` constructor is + * package-private and requires `setAccessible(true)` on every supported Iceberg version (1.5.2 + * / 1.8.1 / 1.10.0 verified). + * + * `specId` is stamped onto the synthesised `ManifestFile` because the manifest reader uses it + * to pick the correct partition spec when materialising partition data on each `DataFile`. + */ + def decodeManifestToDataFiles(bytes: Array[Byte], specId: Int): java.util.List[AnyRef] = { + // Iceberg's `ManifestReader.open` infers format from the file extension; the in-memory path + // must end in `.avro` for the v2 data-manifest path to be picked. The "memory:" scheme just + // namespaces the location so it can't collide with on-disk paths in the same FileIO. + val location = s"memory:comet-manifest-${java.util.UUID.randomUUID()}.avro" + val fileIO = newInMemoryFileIO(location, bytes) + val inputFile = newInMemoryInputFile(location, bytes) + val manifestFile = newDataManifestFile(inputFile, specId) + readDataFilesFromManifest(manifestFile, fileIO) + } + + private def newInMemoryFileIO(location: String, bytes: Array[Byte]): AnyRef = { + val cls = loadClass(ClassNames.INMEMORY_FILE_IO) + val instance = cls.getDeclaredConstructor().newInstance().asInstanceOf[AnyRef] + cls + .getMethod("addFile", classOf[String], classOf[Array[Byte]]) + .invoke(instance, location, bytes) + instance + } + + private def newInMemoryInputFile(location: String, bytes: Array[Byte]): AnyRef = { + val cls = loadClass(ClassNames.INMEMORY_INPUT_FILE) + cls + .getConstructor(classOf[String], classOf[Array[Byte]]) + .newInstance(location, bytes) + .asInstanceOf[AnyRef] + } + + /** + * Construct a `GenericManifestFile` pointing at an in-memory data manifest. Two version-skew + * issues to handle: + * + * 1. Constructor shape changed in Iceberg 1.6 when V3's `first_row_id` field was added: + * - 1.5.2 (Spark 3.4 profile): `(InputFile, int)` -- 2-arg + * - 1.6+ (Spark 3.5 / 4.0 profiles): `(InputFile, int, long)` -- 3-arg with `firstRowId` + * Both forms are package-private. We pass `firstRowId = 0` for the V3 variant because + * all our data manifests are V2 (V3 row-lineage is gated as Unsupported in + * `checkTriggers`). + * + * 2. `ManifestFiles.read` on Iceberg 1.5.2 refuses to read a `ManifestFile` whose + * `snapshotId()` is `null` (`InheritableMetadataFactory.fromManifest` throws "Cannot read from + * ManifestFile with null (unassigned) snapshot ID"). Iceberg 1.8+ relaxed that check. We set + * `snapshotId = 0L` via reflection so the read path is happy across versions; the real snapshot + * id is stamped onto the embedded `DataFile`s later by `BatchWrite.commit(messages)`, so the + * placeholder never reaches storage. + */ + private def newDataManifestFile(inputFile: AnyRef, specId: Int): AnyRef = { + val inputFileClass = loadClass(ClassNames.INPUT_FILE) + val cls = loadClass(ClassNames.GENERIC_MANIFEST_FILE) + val (ctor, args): (java.lang.reflect.Constructor[_], Array[Object]) = + try { + val c = cls.getDeclaredConstructor(inputFileClass, classOf[Int], classOf[Long]) + (c, Array[Object](inputFile, Integer.valueOf(specId), java.lang.Long.valueOf(0L))) + } catch { + case _: NoSuchMethodException => + val c = cls.getDeclaredConstructor(inputFileClass, classOf[Int]) + (c, Array[Object](inputFile, Integer.valueOf(specId))) + } + ctor.setAccessible(true) + val manifest = ctor.newInstance(args: _*).asInstanceOf[AnyRef] + try { + val snapshotIdField = cls.getDeclaredField("snapshotId") + snapshotIdField.setAccessible(true) + snapshotIdField.set(manifest, java.lang.Long.valueOf(0L)) + } catch { + case _: NoSuchFieldException => () // field renamed in a future release; soft-fail + } + manifest + } + + private def readDataFilesFromManifest( + manifestFile: AnyRef, + fileIO: AnyRef): java.util.List[AnyRef] = { + val manifestFileClass = loadClass(ClassNames.MANIFEST_FILE) + val fileIOClass = loadClass(ClassNames.FILE_IO) + val contentFileClass = loadClass(ClassNames.CONTENT_FILE) + val readMethod = loadClass(ClassNames.MANIFEST_FILES) + .getMethod("read", manifestFileClass, fileIOClass) + val reader = readMethod.invoke(null, manifestFile, fileIO) + try { + val iterator = reader.getClass + .getMethod("iterator") + .invoke(reader) + .asInstanceOf[java.util.Iterator[AnyRef]] + val result = new java.util.ArrayList[AnyRef]() + val copyMethod = contentFileClass.getMethod("copy") + while (iterator.hasNext) { + result.add(copyMethod.invoke(iterator.next())) + } + result + } finally { + try reader.getClass.getMethod("close").invoke(reader) + catch { + case e: Exception => logWarning(s"Failed to close ManifestReader: ${e.getMessage}") + } + } + } } /** @@ -732,64 +1373,8 @@ object CometIcebergNativeScanMetadata extends Logging { tableSchema = tableSchema, globalFieldIdMapping = globalFieldIdMapping, catalogProperties = catalogProperties, - catalogName = deriveCatalogName(table), + catalogName = IcebergReflection.deriveCatalogName(table), fileFormat = FileFormats.PARQUET) } } - - /** - * Extracts the Spark V2 catalog name from an Iceberg `Table`. `Table.name()` returns - * `catalog.namespace.table` for tables loaded through a catalog; we intersect against the - * registered V2 catalogs so a value like `s3.foo` is not mistaken for a catalog `s3`. Returns - * `None` for HadoopTables loaded by raw path or when reflection fails. - */ - private[iceberg] def deriveCatalogName(table: Any): Option[String] = - deriveCatalogName(table, registeredCatalogNames _) - - /** - * Test seam that lets tests inject a fixed catalog set without bootstrapping a SparkSession. - */ - private[iceberg] def deriveCatalogName( - table: Any, - knownCatalogNames: () => Iterable[String]): Option[String] = { - if (table == null) return None - invokeTableName(table).flatMap { name => - if (name.isEmpty || name == "null") { - None - } else { - knownCatalogNames() - .find(c => name == c || name.startsWith(c + ".")) - .orElse { - val idx = name.indexOf('.') - if (idx > 0) Some(name.substring(0, idx)) else None - } - } - } - } - - private def invokeTableName(table: Any): Option[String] = { - try { - table.getClass.getMethod("name").invoke(table) match { - case s: String => Some(s) - case other if other != null => Some(other.toString) - case null => None - } - } catch { - case e: Exception => - logWarning( - s"Iceberg reflection: Table.name() not callable on ${table.getClass.getName}. " + - "Native S3 credential dispatch will fall back to bucket-keyed isolation: " + - s"${e.getMessage}") - None - } - } - - private def registeredCatalogNames(): Iterable[String] = - try { - SparkSession.active.sessionState.catalogManager.listCatalogs(None) - } catch { - case e: Exception => - logDebug(s"Could not list V2 catalogs from SparkSession: ${e.getMessage}") - Nil - } } diff --git a/spark/src/main/scala/org/apache/comet/iceberg/IcebergWriteLogical.scala b/spark/src/main/scala/org/apache/comet/iceberg/IcebergWriteLogical.scala new file mode 100644 index 0000000000..77a6e7669d --- /dev/null +++ b/spark/src/main/scala/org/apache/comet/iceberg/IcebergWriteLogical.scala @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.iceberg + +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode} +import org.apache.spark.sql.connector.write.BatchWrite + +/** Logical anchor for the writer. See `IcebergWriteStrategy` for the rationale. */ +case class IcebergWriteLogical( + child: LogicalPlan, + // Logical plans stay driver-side: AQE re-planning is driver-local, and write commands sit + // at the top of the plan so they aren't cached or checkpointed. + @transient batchWrite: BatchWrite, + replaceDataDispatch: Option[ReplaceDataDispatchInfo] = None) + extends UnaryNode { + + override def output: Seq[Attribute] = Nil + + override protected def withNewChildInternal(newChild: LogicalPlan): IcebergWriteLogical = + copy(child = newChild) +} diff --git a/spark/src/main/scala/org/apache/comet/iceberg/IcebergWriteStrategy.scala b/spark/src/main/scala/org/apache/comet/iceberg/IcebergWriteStrategy.scala new file mode 100644 index 0000000000..191b224186 --- /dev/null +++ b/spark/src/main/scala/org/apache/comet/iceberg/IcebergWriteStrategy.scala @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.iceberg + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic, ReplaceData} +import org.apache.spark.sql.comet.{IcebergCommitExec, IcebergWriteExec} +import org.apache.spark.sql.connector.write.Write +import org.apache.spark.sql.execution.{SparkPlan, SparkStrategy} +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation + +import org.apache.comet.CometConf + +/** + * Spark Strategy that intercepts Iceberg V2 copy-on-write logical writes and emits Comet's + * two-operator physical tree (`IcebergCommitExec` committer + `IcebergWriteExec` writer) instead + * of the default `V2ExistingTableWriteExec` family. + * + * Registered via `SparkSessionExtensions.injectPlannerStrategy` so it runs in + * `experimentalMethods.extraStrategies`, ahead of Spark's built-in `DataSourceV2Strategy` (the + * strategy list is concatenated in that order in `SparkPlanner.strategies`). Returning a + * non-empty `Seq[SparkPlan]` short-circuits later strategies for the matched logical node. + * + * Guarded by [[CometConf.COMET_ICEBERG_WRITE_SPLIT_OPERATOR_ENABLED]] (off by default) and by + * [[IcebergReflection.isIcebergSparkWrite]] -- non-Iceberg V2 writes fall through to Spark. + * + * Covers `AppendData` (INSERT INTO), `OverwriteByExpression` (INSERT OVERWRITE with a static + * predicate), `OverwritePartitionsDynamic` (INSERT OVERWRITE with dynamic partition overwrite + * mode), and `ReplaceData` (copy-on-write DML). `WriteDelta` (merge-on-read DML) is intentionally + * left for Spark's default path: the per-task `DeltaWriter` is row-dispatched and the native + * writer cannot emit position-delete files, so the split-operator plan would only add planning + * complexity for no realisable acceleration. + */ +case class IcebergWriteStrategy(session: SparkSession) extends SparkStrategy { + + override def apply(plan: LogicalPlan): Seq[SparkPlan] = { + if (!CometConf.COMET_ICEBERG_WRITE_SPLIT_OPERATOR_ENABLED.get(session.sessionState.conf)) { + return Nil + } + + plan match { + case ad: AppendData => + matchedSparkWrite(ad.table, ad.write, ad.query, replaceDataDispatch = None).toList + case obe: OverwriteByExpression => + matchedSparkWrite(obe.table, obe.write, obe.query, replaceDataDispatch = None).toList + case opd: OverwritePartitionsDynamic => + matchedSparkWrite(opd.table, opd.write, opd.query, replaceDataDispatch = None).toList + case rd: ReplaceData => + // CoW DML: refresh cache for the *original* table relation, mirroring Spark's + // `DataSourceV2Strategy` case (which uses the second `DataSourceV2Relation` argument). The + // `write` carries a `SparkWrite$CopyOnWriteOperation` -- still recognised by + // `isIcebergSparkWrite`. The per-version shim extracts the 4.x `ReplaceDataProjections` + // (absent on 3.4 / 3.5, returning None) so the writer can dispatch the + // `__row_operation`-prefixed row stream. + matchedSparkWrite( + rd.originalTable, + rd.write, + rd.query, + replaceDataDispatch = IcebergReplaceDataShim.extractProjections(rd)).toList + // Iceberg 1.5.2 (Spark 3.4 + iceberg-spark-extensions) ships its own `ReplaceIcebergData` + // logical write node because Spark 3.4 lacks native row-level operation support for + // UPDATE / MERGE on V2 tables. Field shape matches Spark 3.5's stock node one-for-one; + // matched by FQCN so the main module stays free of an iceberg-spark-extensions compile + // dep. On 3.5+ / Iceberg 1.8+ the class doesn't exist, the guard returns false, and the + // case is a no-op. + case plan if IcebergReflection.isReplaceIcebergData(plan) => + IcebergReflection + .extractReplaceIcebergDataFields(plan) + .flatMap { case (_, query, originalTable, write) => + matchedSparkWrite( + originalTable.asInstanceOf[org.apache.spark.sql.catalyst.analysis.NamedRelation], + write.asInstanceOf[Option[Write]], + query.asInstanceOf[LogicalPlan], + replaceDataDispatch = None) + } + .toList + // Hit by AQE. + case IcebergWriteLogical(child, batchWrite, replaceDataDispatch) => + Seq(IcebergWriteExec(batchWrite, planLater(child), replaceDataDispatch)) + case _ => Nil + } + } + + /** + * Common path for the four `Write`-backed logical write nodes. Guarded on + * `IcebergReflection.isIcebergSparkWrite` so non-Iceberg DSv2 writes fall through. The + * `replaceDataDispatch` is `Some` only for Spark 4.x `ReplaceData` (extracted via shim), and + * controls the per-row dispatch in the writer. + */ + private def matchedSparkWrite( + table: org.apache.spark.sql.catalyst.analysis.NamedRelation, + write: Option[Write], + query: LogicalPlan, + replaceDataDispatch: Option[ReplaceDataDispatchInfo]): Option[SparkPlan] = { + table match { + case rel: DataSourceV2Relation => + write.flatMap { w => + if (IcebergReflection.isIcebergSparkWrite(w)) { + Some(buildTwoOp(w, rel, query, replaceDataDispatch)) + } else { + None + } + } + case _ => None + } + } + + /** + * Construct the two-op tree. + * + * - The `BatchWrite` is materialised once and shared between the committer and the writer. + * Iceberg's `Write.toBatch()` mints a fresh `BatchWrite` on every call, and for + * copy-on-write DML the BatchWrite carries the scan state and emitted-file tracking that + * the committer's validation needs to see -- so both execs must run against the same + * instance. + * + * - The writer's child plan is wrapped in `IcebergWriteLogical` to give AQE a stable logical + * anchor each time a stage materialises. Every physical node in Spark carries a + * `logicalLink` pointing back at the logical-plan node it was produced from. AQE uses those + * links as the unit of re-optimisation: when a stage materialises and new runtime stats + * arrive, AQE picks up `logicalLink`, re-runs the optimizer + the SparkPlanner against that + * logical subtree, and substitutes the result back into the physical tree. What that + * subtree contains is therefore what gets re-emitted on every iteration. We aim the + * writer's link at `IcebergWriteLogical` so each re-plan re-emits just the writer (via the + * second `case IcebergWriteLogical(...)` clause in `apply`). This keeps the commit singular + * and lets the data sub-query underneath be optimised by AQE as expected. + * + * - `refreshCache` is what Spark normally runs after a V2 write to invalidate any cached + * `DataFrame` / `Dataset` views of this table so subsequent reads see the new snapshot. + * Since we replace Spark's V2 write exec we have to invoke it ourselves; the committer runs + * it after the snapshot commits. The shim hides Spark 3.x vs 4.x's diverging `SparkSession` + * shape. + */ + private def buildTwoOp( + write: Write, + rel: DataSourceV2Relation, + query: LogicalPlan, + replaceDataDispatch: Option[ReplaceDataDispatchInfo]): SparkPlan = { + val batchWrite = write.toBatch + val refresh: () => Unit = () => IcebergRefreshCacheShim.recacheByPlan(rel) + IcebergCommitExec( + batchWrite, + refresh, + planLater(IcebergWriteLogical(query, batchWrite, replaceDataDispatch))) + } +} diff --git a/spark/src/main/scala/org/apache/comet/iceberg/ReplaceDataDispatchInfo.scala b/spark/src/main/scala/org/apache/comet/iceberg/ReplaceDataDispatchInfo.scala new file mode 100644 index 0000000000..8f9274a9e2 --- /dev/null +++ b/spark/src/main/scala/org/apache/comet/iceberg/ReplaceDataDispatchInfo.scala @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.iceberg + +import org.apache.spark.sql.catalyst.ProjectingInternalRow + +/** + * Mirror of Spark 4.x's `ReplaceDataProjections`, defined here so we can compile against Spark + * 3.4 / 3.5 source trees (where the class doesn't exist). Populated by `IcebergReplaceDataShim` + * on 4.x and left `None` on 3.x; consumed by `IcebergWriteExec`. + */ +case class ReplaceDataDispatchInfo( + rowProjection: ProjectingInternalRow, + metadataProjection: Option[ProjectingInternalRow]) diff --git a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala index d116d2f407..14806bee9b 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala @@ -296,6 +296,15 @@ case class CometExecRule(session: SparkSession) case op: DataWritingCommandExec => convertToComet(op, CometDataWritingCommand).getOrElse(op) + // AQE re-fires `IcebergWriteStrategy` on every stage materialisation, so a partitioned + // write's physical sub-tree may already contain a `CometIcebergWriteExec`. Unwrap to + // avoid a double conversion. See `IcebergWriteStrategy.buildTwoOp` for the full pattern. + case op: IcebergWriteExec if op.child.isInstanceOf[CometIcebergWriteExec] => + op.child + + case op: IcebergWriteExec => + convertToComet(op, CometIcebergNativeWrite).getOrElse(op) + // For AQE broadcast stage on a Comet broadcast exchange case s @ BroadcastQueryStageExec(_, _: CometBroadcastExchangeExec, _) => convertToComet(s, CometExchangeSink).getOrElse(s) @@ -660,14 +669,16 @@ case class CometExecRule(session: SparkSession) firstNativeOp = true } - // CometNativeWriteExec is special: it has two separate plans: + // CometNativeWriteExec / CometIcebergWriteExec are special: they have two separate + // plans: // 1. A protobuf plan (nativeOp) describing the write operation // 2. A Spark plan (child) that produces the data to write // The serializedPlanOpt is a def that always returns Some(...) by serializing - // nativeOp on-demand, so it doesn't need convertBlock(). However, its child - // (e.g., CometNativeScanExec) may need its own serialization. Reset the flag - // so children can start their own native execution blocks. - if (op.isInstanceOf[CometNativeWriteExec]) { + // nativeOp on-demand, so the write exec itself doesn't need convertBlock(). However, + // its child (e.g., CometNativeScanExec, or a CometProject over an AQEShuffleRead) + // needs its own serialization. Reset the flag so children can start their own native + // execution blocks. + if (op.isInstanceOf[CometNativeWriteExec] || op.isInstanceOf[CometIcebergWriteExec]) { firstNativeOp = true } @@ -687,7 +698,7 @@ case class CometExecRule(session: SparkSession) // children are CometNativeExec. This prevents runtime failures when the native operator // expects Arrow arrays but receives non-Arrow data (e.g., OnHeapColumnVector). if (serde.requiresNativeChildren && op.children.nonEmpty) { - // Get the actual data-producing children (unwrap WriteFilesExec if present) + // Get the actual data-producing children (unwrap WriteFilesExec). val dataProducingChildren = op.children.flatMap { case writeFiles: WriteFilesExec => Seq(writeFiles.child) case other => Seq(other) diff --git a/spark/src/main/scala/org/apache/comet/serde/operator/CometDataWritingCommand.scala b/spark/src/main/scala/org/apache/comet/serde/operator/CometDataWritingCommand.scala index 60fb65277e..82d0e1503d 100644 --- a/spark/src/main/scala/org/apache/comet/serde/operator/CometDataWritingCommand.scala +++ b/spark/src/main/scala/org/apache/comet/serde/operator/CometDataWritingCommand.scala @@ -36,7 +36,6 @@ import org.apache.comet.CometSparkSessionExtensions.withFallbackReason import org.apache.comet.objectstore.NativeConfig import org.apache.comet.serde.{CometOperatorSerde, Incompatible, OperatorOuterClass, SupportLevel, Unsupported} import org.apache.comet.serde.OperatorOuterClass.Operator -import org.apache.comet.serde.QueryPlanSerde.serializeDataType /** * CometOperatorSerde implementation for DataWritingCommandExec that converts Parquet write @@ -93,29 +92,13 @@ object CometDataWritingCommand extends CometOperatorSerde[DataWritingCommandExec try { val cmd = op.cmd.asInstanceOf[InsertIntoHadoopFsRelationCommand] - val scanOp = OperatorOuterClass.Scan - .newBuilder() - .setSource(cmd.query.nodeName) - .setArrowFfiSafe(false) - - // Add fields from the query output schema - val scanTypes = cmd.query.output.flatMap { attr => - serializeDataType(attr.dataType) - } - - if (scanTypes.length != cmd.query.output.length) { - withFallbackReason(op, "Cannot serialize data types for native write") - return None + val scanOperator = NativeWriteUtils.buildFfiScan(cmd.query, op.id, ffiSafe = false) match { + case Some(scan) => scan + case None => + withFallbackReason(op, "Cannot serialize data types for native write") + return None } - scanTypes.foreach(scanOp.addFields) - - val scanOperator = Operator - .newBuilder() - .setPlanId(op.id) - .setScan(scanOp.build()) - .build() - val outputPath = cmd.outputPath.toString val codec = parseCompressionCodec(cmd) match { diff --git a/spark/src/main/scala/org/apache/comet/serde/operator/CometIcebergNativeWrite.scala b/spark/src/main/scala/org/apache/comet/serde/operator/CometIcebergNativeWrite.scala new file mode 100644 index 0000000000..9c1555130d --- /dev/null +++ b/spark/src/main/scala/org/apache/comet/serde/operator/CometIcebergNativeWrite.scala @@ -0,0 +1,641 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.serde.operator + +import java.util.Locale + +import scala.jdk.CollectionConverters._ + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.comet.{CometIcebergWriteExec, CometNativeExec, IcebergWriteExec} + +import org.apache.comet.{CometConf, ConfigEntry} +import org.apache.comet.CometSparkSessionExtensions.withFallbackReason +import org.apache.comet.iceberg.IcebergReflection +import org.apache.comet.serde.{CometOperatorSerde, Compatible, OperatorOuterClass, SupportLevel, Unsupported} +import org.apache.comet.serde.OperatorOuterClass.Operator +import org.apache.comet.serde.QueryPlanSerde.exprToProto + +/** + * Detection + serde for Comet's **native** Iceberg V2 write path. Converts an + * [[IcebergWriteExec]] (the JVM-path writer emitted by `IcebergWriteStrategy`) into a + * [[CometIcebergWriteExec]] that drives iceberg-rust via Comet's native execution pipeline. + * + * Gated by [[CometConf.COMET_ICEBERG_NATIVE_WRITE_ENABLED]] (off by default) and by a long list + * of fall-back triggers (table format, encryption, custom location providers, bloom filters, ...) + * -- every trigger is pinned by `CometIcebergWriteDetectionSuite`. `requiresNativeChildren = + * true` so the conversion only fires when the upstream plan is fully Comet-native and Arrow + * batches flow without an intermediate Spark-to-Arrow boundary. + * + * Spark 4.x's `ReplaceData` row stream needs a column-strip projection -- see + * `dropNonDataColumns` below. + */ +object CometIcebergNativeWrite extends CometOperatorSerde[IcebergWriteExec] with Logging { + + override def enabledConfig: Option[ConfigEntry[Boolean]] = + Some(CometConf.COMET_ICEBERG_NATIVE_WRITE_ENABLED) + + override def requiresNativeChildren: Boolean = true + + /** + * Iceberg `TableProperties` keys looked up reflectively the first time the gate runs. + * Centralising them here keeps the property strings identical to Iceberg's own canonical names + * (rather than duplicating literals that could drift across versions). + */ + object PropertyKeys { + lazy val DefaultFileFormat: String = + IcebergReflection.tablePropertyConstant("DEFAULT_FILE_FORMAT") + lazy val DefaultFileFormatValue: String = + IcebergReflection.tablePropertyConstant("DEFAULT_FILE_FORMAT_DEFAULT") + lazy val ObjectStoreEnabled: String = + IcebergReflection.tablePropertyConstant("OBJECT_STORE_ENABLED") + lazy val WriteLocationProviderImpl: String = + IcebergReflection.tablePropertyConstant("WRITE_LOCATION_PROVIDER_IMPL") + lazy val DefaultWriteMetricsMode: String = + IcebergReflection.tablePropertyConstant("DEFAULT_WRITE_METRICS_MODE") + lazy val MetricsModeColumnPrefix: String = + IcebergReflection.tablePropertyConstant("METRICS_MODE_COLUMN_CONF_PREFIX") + lazy val ParquetBloomFilterMaxBytes: String = + IcebergReflection.tablePropertyConstant("PARQUET_BLOOM_FILTER_MAX_BYTES") + lazy val BloomFilterColumnEnabledPrefix: String = + IcebergReflection.tablePropertyConstant("PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX") + lazy val MetricsMaxInferredColumnDefaults: String = + IcebergReflection.tablePropertyConstant("METRICS_MAX_INFERRED_COLUMN_DEFAULTS") + lazy val MetricsMaxInferredColumnDefaultsValue: Int = + IcebergReflection.tablePropertyIntConstant("METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT") + lazy val ParquetRowGroupCheckMinRecordCount: String = + IcebergReflection.tablePropertyConstant("PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT") + lazy val ParquetRowGroupCheckMinRecordCountDefault: Int = + IcebergReflection.tablePropertyIntConstant( + "PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT_DEFAULT") + lazy val ParquetRowGroupCheckMaxRecordCount: String = + IcebergReflection.tablePropertyConstant("PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT") + lazy val ParquetRowGroupCheckMaxRecordCountDefault: Int = + IcebergReflection.tablePropertyIntConstant( + "PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT_DEFAULT") + // Properties referenced as literals -- owned by parquet-mr / a catalog layer outside Iceberg's + // `TableProperties` enum, so there is no constant to quote. + val ParquetEnableDictionary: String = "parquet.enable.dictionary" + val FileIOImpl: String = "io-impl" + } + + private val EncryptionPropertyPrefix = "encryption." + private val CountsMetricsMode = "counts" + private val NoneMetricsMode = "none" + // Storage backends the native iceberg_common::storage_factory_for resolves. Anything else + // (hdfs, abfs, abfss, wasb, wasbs, ...) is supported in iceberg-java but would throw at + // runtime in native, so we gate on the URI scheme here. + private val SupportedStorageSchemes: Set[String] = + Set("file", "memory", "s3", "s3a", "gs", "oss") + // V3 introduces row lineage / variant types / DVs / etc. that iceberg-rust does not emit. + private val MinUnsupportedFormatVersion = 3 + + override def getSupportLevel(op: IcebergWriteExec): SupportLevel = { + val triggers = checkTriggers(op) + triggers match { + case Some(reason) => Unsupported(Some(reason)) + case None => Compatible(None) + } + } + + /** + * Sequence of fall-back gates. Returns the first reason a write should fall back to the JVM + * path, or `None` when all gates pass. Each gate mirrors a behaviour where iceberg-rust + * diverges from iceberg-java; the goal is to write byte-identical Parquet files only when we + * can. + */ + private def checkTriggers(op: IcebergWriteExec): Option[String] = { + val batchWrite = op.batchWrite + if (!IcebergReflection.isIcebergBatchWrite(batchWrite)) { + // Defensive: the strategy only emits IcebergWriteExec for Iceberg writes, but the + // BatchWrite is opaque past `toBatch` -- a future Iceberg version could expose a different + // class hierarchy here. + return Some( + s"BatchWrite is not an Iceberg `SparkWrite` (got ${batchWrite.getClass.getName})") + } + + // `batchWrite` is an inner class of `SparkWrite` (e.g. `SparkWrite$BatchAppend`); the + // `Table` reference lives on the outer instance. + val sparkWrite = IcebergReflection + .getOuterSparkWrite(batchWrite) + .getOrElse(return Some("Could not unwrap outer `SparkWrite` from `BatchWrite`")) + val table = IcebergReflection + .getTableFromSparkWrite(sparkWrite) + .getOrElse(return Some("Iceberg `SparkWrite.table` reflection returned null")) + + val properties = IcebergReflection + .getTableProperties(table) + .map(_.asScala.toMap) + .getOrElse(Map.empty[String, String]) + + val context = TriggerContext(table, properties, sparkWrite) + triggers.iterator.flatMap(rule => rule(context)).find(_.nonEmpty) + } + + // -- Trigger implementations ------------------------------------------------------------------- + + /** + * Inputs available to every trigger. `sparkWrite` is needed for cases where Iceberg's effective + * value diverges from the raw table property (e.g. the resolved data file format comes from + * `SparkWriteConf.dataFileFormat()`, not just `write.format.default`). + */ + private case class TriggerContext(table: Any, properties: Map[String, String], sparkWrite: Any) + + private type TriggerRule = TriggerContext => Option[String] + + private lazy val triggers: Seq[TriggerRule] = Seq( + requireFormatParquet, + requirePropertyAbsentOrNotTrue( + PropertyKeys.ObjectStoreEnabled, + "iceberg-rust has no hashed-prefix object-storage location generator"), + requirePropertyAbsent( + PropertyKeys.WriteLocationProviderImpl, + "custom location providers are not supported"), + requireFormatVersionAtMostTwo, + requireNoEncryptionPrefix, + requireMetricsModeIsNotCounts, + requireMetricsModeIsNotNone, + requireNoPerColumnCountsMetricsMode, + requireNoPerColumnNoneMetricsMode, + requirePropertyAbsent( + PropertyKeys.ParquetBloomFilterMaxBytes, + "parquet-rs has no global bloom-filter byte cap; explicit value would diverge"), + requireNoBloomFilterColumnsEnabled, + requireSchemaWithinMaxInferredColumnDefaults, + requireRowGroupCheckMinRecordCountAtDefault, + requireRowGroupCheckMaxRecordCountAtDefault, + requirePropertyAbsent( + PropertyKeys.ParquetEnableDictionary, + "parquet-rs follows parquet-mr defaults; an explicit override would diverge"), + requireNoPerColumnStatsEnabled, + requirePropertyAbsent( + PropertyKeys.FileIOImpl, + "custom FileIO is not honoured by the native writer's URI-scheme-based storage selection"), + requireSupportedStorageScheme) + + /** + * Iceberg's `SparkWriteConf.dataFileFormat()` checks the per-write `write-format` option BEFORE + * falling back to the `write.format.default` table property. Gating on the raw table property + * alone produces a false-pass (table default parquet, write option `orc` -> Java writes ORC, + * native would write parquet) and a symmetric false-fall-back; both diverge from Java. Prefer + * the resolved `SparkWrite.format` when available; fall back to the table property only when + * reflection misses (e.g. an Iceberg version we have not vetted). + */ + private val requireFormatParquet: TriggerRule = ctx => { + val resolved = IcebergReflection.getFormatFromSparkWrite(ctx.sparkWrite) + val effective = resolved.getOrElse { + val key = PropertyKeys.DefaultFileFormat + val default = PropertyKeys.DefaultFileFormatValue + ctx.properties.getOrElse(key, default).toLowerCase(Locale.ROOT) + } + if (effective == "parquet") None + else Some(s"resolved write format=$effective (only parquet is supported)") + } + + private def requirePropertyAbsentOrNotTrue(key: String, reason: String): TriggerRule = + ctx => { + if (ctx.properties.get(key).exists(_.equalsIgnoreCase("true"))) { + Some(s"$key=true ($reason)") + } else { + None + } + } + + private def requirePropertyAbsent(key: String, reason: String): TriggerRule = + ctx => { + if (ctx.properties.contains(key)) Some(s"$key is set ($reason)") else None + } + + private val requireFormatVersionAtMostTwo: TriggerRule = ctx => + IcebergReflection.getFormatVersion(ctx.table) match { + case Some(v) if v >= MinUnsupportedFormatVersion => + Some( + s"format-version=$v introduces features the Rust writer cannot emit " + + "(row lineage, variant types, deletion vectors, revised manifest rules)") + case _ => None + } + + private val requireNoEncryptionPrefix: TriggerRule = ctx => + ctx.properties.keys + .find(_.startsWith(EncryptionPropertyPrefix)) + .map(k => s"$k is set (Parquet modular encryption not supported)") + + private val requireMetricsModeIsNotCounts: TriggerRule = ctx => { + val key = PropertyKeys.DefaultWriteMetricsMode + ctx.properties + .get(key) + .filter(_.toLowerCase(Locale.ROOT).contains(CountsMetricsMode)) + .map(v => + s"$key=$v mentions '$CountsMetricsMode' (Parquet has no counts-without-bounds mode)") + } + + /** + * iceberg-java with `metrics.default=none` emits a `DataFile` with no per-column metrics. + * iceberg-rust always populates `column_sizes` / `value_counts` / `null_value_counts` / bounds + * from the parquet footer regardless of metrics mode (`parquet_writer.rs:363-397`), so a native + * write would produce manifests strictly richer than Java's. Fall back rather than diverge. + */ + private val requireMetricsModeIsNotNone: TriggerRule = ctx => { + val key = PropertyKeys.DefaultWriteMetricsMode + ctx.properties + .get(key) + .filter(_.trim.toLowerCase(Locale.ROOT) == NoneMetricsMode) + .map(_ => s"$key=$NoneMetricsMode (iceberg-rust always populates per-column metrics)") + } + + private val requireNoPerColumnCountsMetricsMode: TriggerRule = ctx => { + val prefix = PropertyKeys.MetricsModeColumnPrefix + ctx.properties + .find { case (k, v) => + k.startsWith(prefix) && v.toLowerCase(Locale.ROOT) == CountsMetricsMode + } + .map { case (k, _) => s"$k=$CountsMetricsMode (counts-only mode not supported)" } + } + + /** Same rationale as the default-mode `none` gate, applied per column. */ + private val requireNoPerColumnNoneMetricsMode: TriggerRule = ctx => { + val prefix = PropertyKeys.MetricsModeColumnPrefix + ctx.properties + .find { case (k, v) => + k.startsWith(prefix) && v.trim.toLowerCase(Locale.ROOT) == NoneMetricsMode + } + .map { case (k, _) => + s"$k=$NoneMetricsMode (iceberg-rust always populates per-column metrics)" + } + } + + private val requireNoBloomFilterColumnsEnabled: TriggerRule = ctx => { + val prefix = PropertyKeys.BloomFilterColumnEnabledPrefix + ctx.properties + .find { case (k, v) => k.startsWith(prefix) && v.equalsIgnoreCase("true") } + .map { case (k, _) => + s"$k=true (iceberg-rust cannot enforce iceberg-java's bloom-filter byte cap)" + } + } + + /** + * Per-column parquet stats-enabled overrides. `write.parquet.stats-enabled.column.` + * (`TableProperties.PARQUET_COLUMN_STATS_ENABLED_PREFIX`) was only added in Iceberg 1.10.0; on + * 1.5.2 / 1.8.1 the key is silently ignored by Iceberg-Java, so a native write matches Java + * without falling back. Gate only on the versions where the property has an effect -- detected + * by the presence of the constant rather than hard-coding a version. + */ + private val requireNoPerColumnStatsEnabled: TriggerRule = ctx => + IcebergReflection.tablePropertyConstantOpt("PARQUET_COLUMN_STATS_ENABLED_PREFIX").flatMap { + prefix => + ctx.properties + .find { case (k, _) => k.startsWith(prefix) } + .map { case (k, v) => + s"$k=$v (per-column parquet stats overrides not honoured by the native writer)" + } + } + + /** + * Native iceberg_common::storage_factory_for resolves only a fixed set of URI schemes; tables + * whose data location uses any other scheme (hdfs, abfs, wasb, ...) would plan natively then + * crash at write time. Gate on the URI scheme of `Table.locationProvider().newDataLocation()`. + */ + private val requireSupportedStorageScheme: TriggerRule = ctx => { + val location = IcebergReflection.getDataLocation(ctx.table).getOrElse("") + val scheme = if (location.contains("://")) { + location.substring(0, location.indexOf("://")).toLowerCase(Locale.ROOT) + } else { + "file" + } + if (SupportedStorageSchemes.contains(scheme)) { + None + } else { + Some( + s"data location scheme '$scheme' is not supported by the native writer " + + s"(supported: ${SupportedStorageSchemes.toSeq.sorted.mkString(", ")})") + } + } + + private val requireSchemaWithinMaxInferredColumnDefaults: TriggerRule = ctx => { + // Iceberg only applies the inferred-column truncation (first N projected IDs -> truncate(16), + // every column beyond -> MetricsModes.None) when NO explicit `write.metadata.metrics.default` + // is configured: a user-set default is applied to all columns regardless of count + // (`MetricsConfig.from`). So this gate is irrelevant once a default mode is present -- the + // metrics-mode gates above already cover whether that mode itself is supported. + if (ctx.properties.contains(PropertyKeys.DefaultWriteMetricsMode)) { + None + } else { + val key = PropertyKeys.MetricsMaxInferredColumnDefaults + val limit = + parseIntProperty(ctx.properties, key, PropertyKeys.MetricsMaxInferredColumnDefaultsValue) + (for { + schema <- IcebergReflection.getSchema(ctx.table) + count <- IcebergReflection.getProjectedFieldIdCount(schema) + reason <- + if (count > limit) { + Some( + s"schema has $count projected field IDs which exceeds $key=$limit " + + "(iceberg-java applies MetricsModes.None to columns beyond this index)") + } else { + None + } + } yield reason) + } + } + + private def parseIntProperty(props: Map[String, String], key: String, default: Int): Int = + props.get(key).flatMap(s => scala.util.Try(s.trim.toInt).toOption).getOrElse(default) + + private val requireRowGroupCheckMinRecordCountAtDefault: TriggerRule = + requireIntPropertyAtDefault( + PropertyKeys.ParquetRowGroupCheckMinRecordCount, + PropertyKeys.ParquetRowGroupCheckMinRecordCountDefault, + "parquet-rs uses a different row-group sizing cadence; non-default value would diverge") + + private val requireRowGroupCheckMaxRecordCountAtDefault: TriggerRule = + requireIntPropertyAtDefault( + PropertyKeys.ParquetRowGroupCheckMaxRecordCount, + PropertyKeys.ParquetRowGroupCheckMaxRecordCountDefault, + "parquet-rs uses a different row-group sizing cadence; non-default value would diverge") + + private def requireIntPropertyAtDefault( + key: String, + default: Int, + reason: String): TriggerRule = ctx => + ctx.properties.get(key).flatMap { raw => + scala.util.Try(raw.trim.toInt).toOption match { + case Some(v) if v != default => Some(s"$key=$v (default=$default; $reason)") + case Some(_) => None + case None => Some(s"$key=$raw is not an int ($reason)") + } + } + + // -- Conversion -------------------------------------------------------------------------------- + + override def convert( + op: IcebergWriteExec, + builder: Operator.Builder, + childOp: Operator*): Option[OperatorOuterClass.Operator] = { + val _ = (builder, childOp) // unused: we synthesise our own FFI scan child below + try { + for { + icebergWrite <- buildIcebergWriteProto(op) + ffiScan <- buildFfiScan(op) + writeChild <- dropNonDataColumns(op, ffiScan) + } yield OperatorOuterClass.Operator + .newBuilder() + .setPlanId(op.id) + .addChildren(writeChild) + .setIcebergWrite(icebergWrite) + .build() + } catch { + case e: Exception => + withFallbackReason(op, s"Failed to convert Iceberg native write: ${e.getMessage}") + None + } + } + + /** + * Spark 4.x rewrites CoW DML (`ReplaceData`) into a row stream with extra columns -- column 0 + * carries the per-row operation code (`__row_operation`: 5=WRITE, 6=WRITE_WITH_METADATA) and + * the tail of the row carries file/partition metadata (`_file`, `_spec_id`, `_partition`). The + * JVM-side `IcebergWriteExec.runReplaceDataWriter` handles this row-by-row by applying + * `dispatch.rowProjection` before invoking the writer. + * + * The native path forwards Arrow batches as-is to the iceberg-rust writer, which expects + * exactly the Iceberg table's data columns. Without an explicit projection step we end up + * giving it the wider row (e.g. 6 columns when the schema has 3) and + * `decorate_batch_with_field_ids` rejects the batch. + * + * For Spark 3.4 / 3.5 the strategy shim returns `None` for `replaceDataDispatch` and the + * upstream plan already projects to the data columns -- no extra projection needed. For 4.x we + * splice a `Projection` proto between our `IcebergWrite` op and the FFI `Scan`, selecting the + * upstream attributes whose names match the Iceberg schema's columns. The JVM-side child stays + * at the original wide output, so its `executeColumnar()` still emits the wide batches the FFI + * scan declares; the projection then strips them inside the native runtime before the writer + * sees the data. + */ + private def dropNonDataColumns( + op: IcebergWriteExec, + scan: OperatorOuterClass.Operator): Option[OperatorOuterClass.Operator] = { + if (op.replaceDataDispatch.isEmpty) return Some(scan) + + val sparkWrite = IcebergReflection.getOuterSparkWrite(op.batchWrite).getOrElse { + withFallbackReason(op, "Could not unwrap outer SparkWrite for ReplaceData projection") + return None + } + val writeSchema = IcebergReflection.getWriteSchemaFromSparkWrite(sparkWrite).getOrElse { + withFallbackReason( + op, + "SparkWrite.writeSchema reflection failed for ReplaceData projection") + return None + } + val dataFieldNames = IcebergReflection.getSchemaFieldNames(writeSchema).getOrElse { + withFallbackReason( + op, + "Could not extract Iceberg schema column names for ReplaceData projection") + return None + } + val upstreamOutput = op.child.output + val missing = dataFieldNames.filterNot(name => upstreamOutput.exists(_.name == name)) + if (missing.nonEmpty) { + withFallbackReason( + op, + s"ReplaceData projection: columns ${missing.mkString("[", ", ", "]")} not in upstream " + + s"output ${upstreamOutput.map(_.name).mkString("[", ", ", "]")}") + return None + } + val projectList = dataFieldNames.map(name => upstreamOutput.find(_.name == name).get) + val protoExprs = projectList.map(attr => exprToProto(attr, upstreamOutput)) + if (!protoExprs.forall(_.isDefined)) { + withFallbackReason(op, "Could not serialise ReplaceData projection attributes to proto") + return None + } + val projection = OperatorOuterClass.Projection + .newBuilder() + .addAllProjectList(protoExprs.map(_.get).asJava) + .build() + Some( + OperatorOuterClass.Operator + .newBuilder() + .setPlanId(op.id) + .addChildren(scan) + .setProjection(projection) + .build()) + } + + private def buildFfiScan(op: IcebergWriteExec): Option[OperatorOuterClass.Operator] = { + val scan = + NativeWriteUtils.buildFfiScan(op.child, op.id, NativeWriteUtils.isUpstreamFfiSafe(op.child)) + if (scan.isEmpty) { + withFallbackReason( + op, + "Cannot serialize upstream data types for Iceberg native write FFI scan") + } + scan + } + + override def createExec(nativeOp: Operator, op: IcebergWriteExec): CometNativeExec = { + val sparkWrite = IcebergReflection + .getOuterSparkWrite(op.batchWrite) + .getOrElse( + throw new IllegalStateException( + "Native Iceberg write conversion: could not unwrap outer SparkWrite from BatchWrite")) + val table = IcebergReflection + .getTableFromSparkWrite(sparkWrite) + .getOrElse( + throw new IllegalStateException( + "Native Iceberg write conversion: SparkWrite.table reflection failed")) + val outputSpecId = IcebergReflection + .getOutputSpecIdFromSparkWrite(sparkWrite) + .getOrElse( + throw new IllegalStateException( + "Native Iceberg write conversion: SparkWrite.outputSpecId reflection failed")) + CometIcebergWriteExec( + nativeOp, + op.child, + op.batchWrite, + table.asInstanceOf[AnyRef], + outputSpecId) + } + + /** + * Assemble the per-write `IcebergWrite` protobuf. All reflection calls are localised here so a + * missing accessor surfaces as a `withFallbackReason` fall-back rather than a planning-time + * crash. + * + * Note: even though `getSupportLevel` is currently returning `Unsupported`, this method stays + * fully implemented so the moment the Avro decoder lands the native path engages without + * additional plumbing work. + */ + private def buildIcebergWriteProto( + op: IcebergWriteExec): Option[OperatorOuterClass.IcebergWrite] = { + val sparkWrite = IcebergReflection.getOuterSparkWrite(op.batchWrite).getOrElse { + withFallbackReason(op, "Could not unwrap outer SparkWrite from BatchWrite") + return None + } + val table = IcebergReflection.getTableFromSparkWrite(sparkWrite).getOrElse { + withFallbackReason(op, "Could not extract Iceberg Table from SparkWrite") + return None + } + + val properties = IcebergReflection + .getTableProperties(table) + .map(_.asScala.toMap) + .getOrElse(Map.empty[String, String]) + val catalogProperties = + IcebergReflection.getFileIOProperties(table).getOrElse(Map.empty[String, String]) + + val metadataLocation = IcebergReflection.getMetadataLocation(table).getOrElse { + withFallbackReason(op, "Iceberg Table has no metadata location (metadata-table scan?)") + return None + } + val outputSpecId = IcebergReflection.getOutputSpecIdFromSparkWrite(sparkWrite).getOrElse { + withFallbackReason(op, "SparkWrite.outputSpecId reflection failed") + return None + } + val partitionSpec = IcebergReflection.getPartitionSpecById(table, outputSpecId).getOrElse { + withFallbackReason(op, s"No partition spec found for id=$outputSpecId") + return None + } + val partitionSpecJson = IcebergReflection.partitionSpecToJson(partitionSpec).getOrElse { + withFallbackReason(op, "PartitionSpecParser.toJson failed") + return None + } + val writeSchema = IcebergReflection.getWriteSchemaFromSparkWrite(sparkWrite).getOrElse { + withFallbackReason(op, "SparkWrite.writeSchema reflection failed") + return None + } + val icebergSchemaJson = IcebergReflection.schemaToJson(writeSchema).getOrElse { + withFallbackReason(op, "SchemaParser.toJson failed") + return None + } + // Iceberg's Spark `SparkWrite$WriterFactory` does NOT wire the table sort order into the + // per-file writer factory for batch appends in any Iceberg version Comet targets (1.5.2 / + // 1.8.1 / 1.10.0): it builds `SparkFileWriterFactory` without `.dataSortOrder(...)`, so every + // committed data file is stamped `sort_order_id = 0` (unsorted) even when the table itself has + // a non-default sort order. We match that exactly. The + // `SparkWriteConf.outputSortOrderId(writeRequirements)` resolver (explicit option / table order + // when an ordering is required / unsorted) and the matching `.dataSortOrder(...)` wiring only + // exist in Iceberg 1.11+; reflect the resolver when it is present so this stays correct if the + // pinned runtime is bumped, otherwise default to 0. + val sortOrderId = + IcebergReflection.getOutputSortOrderIdFromSparkWrite(sparkWrite).getOrElse(0) + val dataLocation = IcebergReflection.getDataLocation(table).getOrElse { + withFallbackReason(op, "Table.locationProvider().newDataLocation reflection failed") + return None + } + val operationId = IcebergReflection.getOperationIdFromSparkWrite(sparkWrite).getOrElse { + withFallbackReason(op, "SparkWrite.queryId reflection failed") + return None + } + val targetFileSize = + IcebergReflection.getTargetFileSizeFromSparkWrite(sparkWrite).getOrElse { + withFallbackReason(op, "SparkWrite.targetFileSize reflection failed") + return None + } + val useFanoutWriter = + IcebergReflection.getUseFanoutWriterFromSparkWrite(sparkWrite).getOrElse { + withFallbackReason(op, "SparkWrite.useFanoutWriter reflection failed") + return None + } + val specIsUnpartitioned = isUnpartitionedSpec(partitionSpec) + val writerMode = IcebergWriteProtoTranslation.resolveWriterMode( + specIsUnpartitioned = specIsUnpartitioned, + useFanoutWriter = useFanoutWriter) + + val createdBy = s"Apache Iceberg ${IcebergReflection.icebergVersion()} (Comet)" + // Iceberg's `RegistryBasedFileWriterFactory` merges resolved write properties (codec, level, + // and other effective settings carried on `SparkWrite`) over the table's properties when + // building the per-file writer. Mirror that merge here so per-write options (e.g. + // `option("write-parquet-compression-codec", "gzip")`) survive into the native writer. + val resolvedWriteProperties = + IcebergReflection.getWritePropertiesFromSparkWrite(sparkWrite).getOrElse(Map.empty) + val effectiveProperties = properties ++ resolvedWriteProperties + val parquetSettings = + IcebergWriteProtoTranslation.buildParquetSettings(effectiveProperties, createdBy) + + val common = IcebergWriteProtoTranslation.buildCommon( + catalogProperties = catalogProperties, + metadataLocation = metadataLocation, + icebergSchemaJson = icebergSchemaJson, + partitionSpecJson = partitionSpecJson, + sortOrderId = sortOrderId, + dataLocation = dataLocation, + operationId = operationId, + targetFileSizeBytes = targetFileSize, + writerMode = writerMode, + parquetSettings = parquetSettings, + catalogName = IcebergReflection.deriveCatalogName(table)) + + Some(OperatorOuterClass.IcebergWrite.newBuilder().setCommon(common).build()) + } + + /** + * `PartitionSpec.isUnpartitioned()` -- accessed reflectively because Iceberg is `test`-scoped + * on the main source classpath. + */ + private def isUnpartitionedSpec(spec: Any): Boolean = + try { + spec.getClass.getMethod("isUnpartitioned").invoke(spec).asInstanceOf[Boolean] + } catch { + case _: Exception => + val fields = spec.getClass + .getMethod("fields") + .invoke(spec) + .asInstanceOf[java.util.List[_]] + fields.isEmpty + } +} diff --git a/spark/src/main/scala/org/apache/comet/serde/operator/IcebergWriteProtoTranslation.scala b/spark/src/main/scala/org/apache/comet/serde/operator/IcebergWriteProtoTranslation.scala new file mode 100644 index 0000000000..b67033c6e0 --- /dev/null +++ b/spark/src/main/scala/org/apache/comet/serde/operator/IcebergWriteProtoTranslation.scala @@ -0,0 +1,249 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.serde.operator + +import java.util.Locale + +import scala.jdk.CollectionConverters._ +import scala.util.Try + +import org.apache.spark.internal.Logging + +import org.apache.comet.iceberg.IcebergReflection +import org.apache.comet.serde.OperatorOuterClass._ + +/** + * Pure translation from the resolved per-write property map (Iceberg table properties merged with + * any write options) and a small set of driver-side state values into the protobuf messages + * shipped to the Rust writer. + * + * Kept free of any `SparkWrite` reference so each translation function can be unit-tested with + * plain `Map[String, String]` inputs. The serde wires these into the protobuf during conversion; + * the JVM exec wrapper fills in `partition_id` and `task_attempt_id` at task launch. + */ +object IcebergWriteProtoTranslation extends Logging { + + /** + * Iceberg `TableProperties` constants the translation depends on. Resolved lazily through the + * reflection bridge so we always quote Iceberg's canonical names rather than duplicating + * literal strings. + */ + object Keys { + lazy val ParquetCompression: String = + IcebergReflection.tablePropertyConstant("PARQUET_COMPRESSION") + lazy val ParquetCompressionDefaultSince14: String = + IcebergReflection.tablePropertyConstant("PARQUET_COMPRESSION_DEFAULT_SINCE_1_4_0") + lazy val ParquetCompressionLevel: String = + IcebergReflection.tablePropertyConstant("PARQUET_COMPRESSION_LEVEL") + lazy val ParquetRowGroupSizeBytes: String = + IcebergReflection.tablePropertyConstant("PARQUET_ROW_GROUP_SIZE_BYTES") + lazy val ParquetPageSizeBytes: String = + IcebergReflection.tablePropertyConstant("PARQUET_PAGE_SIZE_BYTES") + lazy val ParquetPageRowLimit: String = + IcebergReflection.tablePropertyConstant("PARQUET_PAGE_ROW_LIMIT") + lazy val ParquetDictSizeBytes: String = + IcebergReflection.tablePropertyConstant("PARQUET_DICT_SIZE_BYTES") + lazy val MetricsModeDefault: String = + IcebergReflection.tablePropertyConstant("DEFAULT_WRITE_METRICS_MODE") + lazy val MetricsModeDefaultValue: String = + IcebergReflection.tablePropertyConstant("DEFAULT_WRITE_METRICS_MODE_DEFAULT") + lazy val MetricsModeColumnPrefix: String = + IcebergReflection.tablePropertyConstant("METRICS_MODE_COLUMN_CONF_PREFIX") + } + + /** Iceberg's numeric defaults, pulled at runtime so they stay in lock-step with the runtime. */ + object Defaults { + lazy val RowGroupSizeBytes: Long = + IcebergReflection.tablePropertyIntConstant("PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT").toLong + lazy val PageSizeBytes: Long = + IcebergReflection.tablePropertyIntConstant("PARQUET_PAGE_SIZE_BYTES_DEFAULT").toLong + lazy val DictSizeBytes: Long = + IcebergReflection.tablePropertyIntConstant("PARQUET_DICT_SIZE_BYTES_DEFAULT").toLong + lazy val PageRowLimit: Int = + IcebergReflection.tablePropertyIntConstant("PARQUET_PAGE_ROW_LIMIT_DEFAULT") + } + + /** Builds the parquet settings message. Pure: no SparkWrite or Iceberg `Table` access. */ + def buildParquetSettings( + props: Map[String, String], + createdBy: String): IcebergParquetWriteSettings = { + val rowGroupSize = + parseLong(props, Keys.ParquetRowGroupSizeBytes, Defaults.RowGroupSizeBytes) + val pageSize = parseLong(props, Keys.ParquetPageSizeBytes, Defaults.PageSizeBytes) + val dictSize = parseLong(props, Keys.ParquetDictSizeBytes, Defaults.DictSizeBytes) + val pageRowLimit = parseInt(props, Keys.ParquetPageRowLimit, Defaults.PageRowLimit) + val compression = resolveCompression(props) + val builder = IcebergParquetWriteSettings + .newBuilder() + .setCompression(compression) + .setRowGroupSizeBytes(rowGroupSize) + .setPageSizeBytes(pageSize) + .setDictSizeBytes(dictSize) + .setPageRowLimit(pageRowLimit) + .setCreatedBy(createdBy) + + resolveCompressionLevel(props, compression).foreach(builder.setCompressionLevel) + + val rawMetricsMode = props.getOrElse(Keys.MetricsModeDefault, Keys.MetricsModeDefaultValue) + val defaultMetricsMode = parseMetricsMode(rawMetricsMode) + builder.setDefaultStatisticsEnabled(defaultMetricsMode.enabled) + defaultMetricsMode.truncateLength.foreach(builder.setStatisticsTruncateLength) + + collectColumnStatistics(props).foreach(builder.addColumnStatistics) + + builder.build() + } + + /** + * Driver-side resolution of the iceberg-rust writer flavor. The choice mirrors + * `SparkWrite$WriterFactory`: unpartitioned tables get `UnpartitionedDataWriter`; partitioned + * tables either fan out or cluster based on `SparkWriteConf.useFanoutWriter`. + */ + def resolveWriterMode( + specIsUnpartitioned: Boolean, + useFanoutWriter: Boolean): IcebergWriterMode = { + if (specIsUnpartitioned) IcebergWriterMode.ICEBERG_WRITER_UNPARTITIONED + else if (useFanoutWriter) IcebergWriterMode.ICEBERG_WRITER_FANOUT + else IcebergWriterMode.ICEBERG_WRITER_CLUSTERED + } + + /** Builds the per-write broadcast message. */ + // scalastyle:off argcount + def buildCommon( + catalogProperties: Map[String, String], + metadataLocation: String, + icebergSchemaJson: String, + partitionSpecJson: String, + sortOrderId: Int, + dataLocation: String, + operationId: String, + targetFileSizeBytes: Long, + writerMode: IcebergWriterMode, + parquetSettings: IcebergParquetWriteSettings, + catalogName: Option[String]): IcebergWriteCommon = { + val builder = IcebergWriteCommon + .newBuilder() + .setMetadataLocation(metadataLocation) + .setIcebergSchemaJson(icebergSchemaJson) + .setPartitionSpecJson(partitionSpecJson) + .setSortOrderId(sortOrderId) + .setDataLocation(dataLocation) + .setOperationId(operationId) + .setTargetFileSizeBytes(targetFileSizeBytes) + .setWriterMode(writerMode) + .setParquetSettings(parquetSettings) + if (catalogProperties.nonEmpty) builder.putAllCatalogProperties(catalogProperties.asJava) + catalogName.foreach(builder.setCatalogName) + builder.build() + } + // scalastyle:on argcount + + // --- Internals ------------------------------------------------------------ + + /** + * Iceberg metrics-mode string -> (stats on/off, optional truncate length). Grammar is `none`, + * `counts`, `truncate(N)`, or `full`. `counts` is filtered out by the detection rule earlier in + * the pipeline; the parser still recognises it defensively so a stray value yields a clear + * failure mode rather than silently mapping to "full". + */ + private case class MetricsMode(enabled: Boolean, truncateLength: Option[Int]) + + private val TruncatePattern = """truncate\((\d+)\)""".r + + private def parseMetricsMode(value: String): MetricsMode = + value.trim.toLowerCase(Locale.ROOT) match { + case "none" => MetricsMode(enabled = false, truncateLength = None) + case "full" => MetricsMode(enabled = true, truncateLength = None) + case TruncatePattern(n) => + MetricsMode(enabled = true, truncateLength = Some(n.toInt)) + case "counts" => + throw new IllegalArgumentException( + "'counts' metrics mode must be filtered by the detection rule before translation") + case other => + throw new IllegalArgumentException(s"Unrecognised metrics mode: $other") + } + + /** + * Iceberg accepts `uncompressed`, `none` (treated as uncompressed), `snappy`, `gzip`, `lz4`, + * `zstd`, `brotli`. Defaults to `zstd` (since Iceberg 1.4). + */ + private def resolveCompression(props: Map[String, String]): CompressionCodec = { + val raw = props + .get(Keys.ParquetCompression) + .map(_.trim.toLowerCase(Locale.ROOT)) + .getOrElse(Keys.ParquetCompressionDefaultSince14) + raw match { + case "uncompressed" | "none" => CompressionCodec.None + case "snappy" => CompressionCodec.Snappy + case "gzip" => CompressionCodec.Gzip + case "lz4" => CompressionCodec.Lz4 + case "zstd" => CompressionCodec.Zstd + case "brotli" => CompressionCodec.Brotli + case other => + throw new IllegalArgumentException(s"Unsupported parquet codec '$other'") + } + } + + /** + * Iceberg leaves `write.parquet.compression-level` null by default and lets each parquet writer + * pick its own per-codec default. The only codec where parquet-rs and parquet-mr diverge is + * zstd (parquet-rs default 1, parquet-mr default 3). To produce files the same size as + * iceberg-java would, substitute parquet-mr's 3 when zstd is in use and the user did not set an + * explicit level. Other codecs match by accident: gzip 6 = 6, brotli 1 = 1; snappy and lz4 have + * no level concept. + */ + private def resolveCompressionLevel( + props: Map[String, String], + codec: CompressionCodec): Option[Int] = { + val explicit = + props.get(Keys.ParquetCompressionLevel).flatMap(s => Try(s.trim.toInt).toOption) + explicit.orElse(parquetMrDefaultLevel(codec)) + } + + private def parquetMrDefaultLevel(codec: CompressionCodec): Option[Int] = codec match { + case CompressionCodec.Zstd => Some(3) + case _ => None + } + + private def collectColumnStatistics( + props: Map[String, String]): Seq[IcebergParquetColumnStatistics] = { + val modeByColumn = props.collect { + case (k, v) if k.startsWith(Keys.MetricsModeColumnPrefix) => + k.substring(Keys.MetricsModeColumnPrefix.length) -> parseMetricsMode(v) + } + + modeByColumn.keys.toSeq.sorted.map { column => + val mode = modeByColumn(column) + val builder = IcebergParquetColumnStatistics + .newBuilder() + .setColumn(column) + .setEnabled(mode.enabled) + mode.truncateLength.foreach(builder.setTruncateLength) + builder.build() + } + } + + private def parseLong(props: Map[String, String], key: String, default: Long): Long = + props.get(key).flatMap(s => Try(s.trim.toLong).toOption).getOrElse(default) + + private def parseInt(props: Map[String, String], key: String, default: Int): Int = + props.get(key).flatMap(s => Try(s.trim.toInt).toOption).getOrElse(default) + +} diff --git a/spark/src/main/scala/org/apache/comet/serde/operator/NativeWriteUtils.scala b/spark/src/main/scala/org/apache/comet/serde/operator/NativeWriteUtils.scala new file mode 100644 index 0000000000..25d1791f92 --- /dev/null +++ b/spark/src/main/scala/org/apache/comet/serde/operator/NativeWriteUtils.scala @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.serde.operator + +import org.apache.spark.sql.catalyst.plans.QueryPlan +import org.apache.spark.sql.comet.{CometLocalTableScanExec, CometScanWrapper, CometSparkToColumnarExec} +import org.apache.spark.sql.execution.SparkPlan + +import org.apache.comet.serde.OperatorOuterClass +import org.apache.comet.serde.QueryPlanSerde.serializeDataType + +/** + * Shared helpers for native write serdes ([[CometDataWritingCommand]] for V1 parquet writes and + * [[CometIcebergNativeWrite]] for V2 Iceberg writes). The synthetic FFI `Scan` they each splice + * under the writer is identical except for the `arrowFfiSafe` decision. + */ +object NativeWriteUtils { + + /** + * Build a synthetic FFI `Scan` operator that lets a native write op consume Arrow batches + * shipped from the JVM iterator over `plan`'s `executeColumnar()` RDD. `arrowFfiSafe` controls + * whether the native runtime may hold buffer pointers across the JNI boundary (`true`, + * zero-copy) or must import every batch via Spark's `ArrowConverters` (`false`). + * + * Returns `None` if any of `plan.output`'s data types can't be serialised to the proto -- in + * that case the caller should fall back with `withFallbackReason`. + */ + def buildFfiScan( + plan: QueryPlan[_], + planId: Int, + ffiSafe: Boolean): Option[OperatorOuterClass.Operator] = { + val scanTypes = plan.output.flatMap(attr => serializeDataType(attr.dataType)) + if (scanTypes.length != plan.output.length) return None + val scan = OperatorOuterClass.Scan + .newBuilder() + .setSource(plan.nodeName) + .setArrowFfiSafe(ffiSafe) + scanTypes.foreach(scan.addFields) + Some( + OperatorOuterClass.Operator + .newBuilder() + .setPlanId(planId) + .setScan(scan.build()) + .build()) + } + + /** + * Whether `plan` promises FFI-safe Arrow output (no array reuse across batches). + * `CometLocalTableScanExec` and `CometSparkToColumnarExec` reuse arrays and must drive the + * native side via the non-FFI path. `CometScanWrapper` peeks past the wrapper since the sink's + * `isFfiSafe` flag isn't surfaced on the wrapper itself. + */ + def isUpstreamFfiSafe(plan: SparkPlan): Boolean = plan match { + case _: CometLocalTableScanExec => false + case _: CometSparkToColumnarExec => false + case wrapper: CometScanWrapper => isUpstreamFfiSafe(wrapper.originalPlan) + case _ => true + } +} diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometIcebergWriteExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometIcebergWriteExec.scala new file mode 100644 index 0000000000..892d1fac19 --- /dev/null +++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometIcebergWriteExec.scala @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.comet + +import org.apache.spark.TaskContext +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} +import org.apache.spark.sql.catalyst.expressions.UnsafeProjection +import org.apache.spark.sql.connector.write.{BatchWrite, WriterCommitMessage} +import org.apache.spark.sql.execution.{ColumnarToRowTransition, SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} +import org.apache.spark.sql.types.BinaryType +import org.apache.spark.sql.vectorized.ColumnarBatch + +import com.google.protobuf.CodedOutputStream + +import org.apache.comet.CometExecIterator +import org.apache.comet.iceberg.IcebergReflection +import org.apache.comet.serde.OperatorOuterClass.Operator + +/** + * Native variant of [[IcebergWriteExec]]. Drives the iceberg-rust writer stack via Comet's native + * execution pipeline; the JVM side decodes the per-task Avro-encoded `DataFile` blob the native + * operator emits and packages it as a [[WriterCommitMessage]] so the outer [[IcebergCommitExec]] + * consumes it unchanged. + * + * Selected by [[org.apache.comet.serde.operator.CometIcebergNativeWrite]] when the table's + * properties allow it (parquet, V2, no encryption, ...) and the child plan is fully Comet-native; + * otherwise the JVM path's [[IcebergWriteExec]] runs instead. + * + * @param nativeOp + * Template operator carrying the `IcebergWrite` proto. Per-task `partition_id` / + * `task_attempt_id` get stamped on a copy at execution time. + * @param child + * Comet native child (must be a [[CometNativeExec]] so columnar batches flow through FFI). + * @param batchWrite + * Shared with the outer [[IcebergCommitExec]] -- the same instance the strategy materialised + * via `write.toBatch`. Used here only to provide the `dataLocation` / partition spec needed by + * the native side; never invoked for commit. + * @param partitionSpecId + * Output partition spec id (from `SparkWrite.outputSpecId`). Decoded `DataFile`s are stamped + * with this spec id; required because iceberg-rust's `DataFile` is spec-agnostic at the wire. + */ +case class CometIcebergWriteExec( + nativeOp: Operator, + child: SparkPlan, + @transient batchWrite: BatchWrite, + @transient table: AnyRef, + partitionSpecId: Int) + extends CometNativeExec + with UnaryExecNode + // We consume Arrow batches (via FFI) and emit row-shaped commit messages, so we are a + // columnar-to-row transition. Without this trait Spark's + // `ApplyColumnarRulesAndInsertTransitions` wedges a `CometNativeColumnarToRowExec` between + // us and the Comet-native child, which would then fail `child.executeColumnar()` in + // `doExecuteColumnar`. + with ColumnarToRowTransition { + + override def originalPlan: SparkPlan = child + + // Same output schema as IcebergWriteExec so the outer IcebergCommitExec consumes the + // commit messages identically regardless of which inner exec emitted them. + override def output: Seq[Attribute] = Seq( + AttributeReference(IcebergWriteExec.CommitMessageColumn, BinaryType, nullable = false)()) + + // Native exec emits a single Binary column; the surrounding command framework expects rows, so + // the outer commit exec calls executeCollect on us. supportsColumnar = false keeps Spark from + // inserting a ColumnarToRow that would clash with our (Nil-output-like) row contract. + override def supportsColumnar: Boolean = false + + override def executeCollect(): Array[InternalRow] = { + val rdd = doExecute() + // SparkPlan.executeCollect defaults to byteArrayRdd which goes through UnsafeRow encoding; + // doExecute already projects each row through UnsafeProjection (see the per-task closure + // below) so a plain `collect()` is safe. + rdd.collect() + } + + override def serializedPlanOpt: SerializedPlan = { + val size = nativeOp.getSerializedSize + val bytes = new Array[Byte](size) + val codedOutput = CodedOutputStream.newInstance(bytes) + nativeOp.writeTo(codedOutput) + codedOutput.checkNoSpaceLeft() + SerializedPlan(Some(bytes)) + } + + override def withNewChildInternal(newChild: SparkPlan): SparkPlan = copy(child = newChild) + + override def nodeName: String = "CometIcebergWrite" + + // Names mirror Spark's stock `BatchWriteHelper` metrics (`numFiles` / `numOutputRows` / + // `numOutputBytes`) so the Spark SQL UI shows the same row as it would for a non-Comet + // Iceberg write. + override lazy val metrics: Map[String, SQLMetric] = Map( + "numFiles" -> SQLMetrics.createMetric(sparkContext, "number of files written"), + "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), + "numOutputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "written output")) + + override def doExecute(): RDD[InternalRow] = { + val columnarRdd = doExecuteColumnar() + val specId = partitionSpecId + val sortOrderId = nativeOp.getIcebergWrite.getCommon.getSortOrderId + val schemaTypes = output.map(_.dataType).toArray + val filesMetric = longMetric("numFiles") + val rowsMetric = longMetric("numOutputRows") + val bytesMetric = longMetric("numOutputBytes") + columnarRdd.mapPartitionsInternal { batches => + // Per-task: drain the native output (single-row Binary column carrying one V2 data + // manifest), decode it via Iceberg's `ManifestFiles.read` to recover `DataFile`s, stamp + // the table's `sort_order_id` on each (iceberg-rust's writer doesn't expose the field, so + // we re-apply it here to match what `SparkWrite` would have stamped on the stock path), + // wrap them in `SparkWrite$TaskCommit` via reflection (the constructor is package- + // private), Java-serialize the message so the outer `IcebergCommitExec` consumes it + // identically to the JVM-path output. Empty manifests (zero data files written) are + // still committed; the surrounding `BatchWrite.commit(messages)` handles the no-op case + // correctly. + val manifestBytes = drainAvroPayload(batches) + val proj = UnsafeProjection.create(schemaTypes) + val dataFiles = + if (manifestBytes.isEmpty) new java.util.ArrayList[AnyRef]() + else IcebergReflection.decodeManifestToDataFiles(manifestBytes, specId) + IcebergReflection.stampSortOrderId(dataFiles, sortOrderId) + val (rowSum, byteSum) = IcebergReflection.sumDataFileMetrics(dataFiles) + filesMetric.add(dataFiles.size().toLong) + rowsMetric.add(rowSum) + bytesMetric.add(byteSum) + // Mirror iceberg-java's `SparkWrite$TaskCommit.reportOutputMetrics()`: surface the same + // bytes/records counters via Spark's per-task `OutputMetrics` so the Spark UI "Output" + // column and `SparkListenerTaskEnd` payloads match the JVM-path writer. + Option(TaskContext.get()).foreach { tc => + val outputMetrics = tc.taskMetrics().outputMetrics + outputMetrics.setBytesWritten(byteSum) + outputMetrics.setRecordsWritten(rowSum) + } + val taskCommit = IcebergReflection.buildTaskCommit(dataFiles) + val serialized = + IcebergWriteExec.serializeMessage(taskCommit.asInstanceOf[WriterCommitMessage]) + Iterator.single(proj(InternalRow(serialized)).copy()) + } + } + + override def doExecuteColumnar(): RDD[ColumnarBatch] = { + val childRDD = if (child.supportsColumnar) { + child.executeColumnar() + } else { + throw new UnsupportedOperationException( + "CometIcebergWriteExec requires a columnar (Comet native) child; got " + + child.getClass.getName) + } + + val numPartitions = childRDD.getNumPartitions + // Native side emits a single Binary column carrying the per-task Avro `DataFile` blob. + val numOutputCols = 1 + val capturedNativeOp = nativeOp + + childRDD.mapPartitionsInternal { iter => + val partitionId = TaskContext.getPartitionId() + val taskAttemptId = TaskContext.get().taskAttemptId() + + val nativeMetrics = CometMetricNode.fromCometPlan(this) + Option(TaskContext.get()).foreach(nativeMetrics.reportNativeWriteOutputMetrics) + + val taskNativeOp = { + val icebergWrite = capturedNativeOp.getIcebergWrite.toBuilder + .setPartitionId(partitionId) + .setTaskAttemptId(taskAttemptId) + .build() + capturedNativeOp.toBuilder.setIcebergWrite(icebergWrite).build() + } + + val size = taskNativeOp.getSerializedSize + val planBytes = new Array[Byte](size) + val codedOutput = CodedOutputStream.newInstance(planBytes) + taskNativeOp.writeTo(codedOutput) + codedOutput.checkNoSpaceLeft() + + val execIterator = new CometExecIterator( + CometExec.newIterId, + Seq(iter), + numOutputCols, + planBytes, + nativeMetrics, + numPartitions, + partitionId, + None, + Seq.empty) + + execIterator + } + } + + /** + * Drain a partition's columnar output. `iceberg_write.rs` always emits exactly one batch with + * one `BINARY` row carrying the per-task V2 data manifest (possibly empty bytes when no data + * files were written). The asserts guard against a future native-side change. + */ + private def drainAvroPayload(batches: Iterator[ColumnarBatch]): Array[Byte] = { + val batch = batches.next() + val payload = + try { + require( + batch.numRows() == 1, + s"iceberg_write expected exactly 1 row per task, got ${batch.numRows()}") + batch.column(0).getBinary(0) + } finally { + batch.close() + } + require(!batches.hasNext, "iceberg_write produced more than one batch for this task") + payload + } +} diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometMetricNode.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometMetricNode.scala index b3b89d043a..c7e5827869 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/CometMetricNode.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometMetricNode.scala @@ -95,10 +95,13 @@ case class CometMetricNode(metrics: Map[String, SQLMetric], children: Seq[CometM */ def reportNativeWriteOutputMetrics(ctx: TaskContext): Unit = { ctx.addTaskCompletionListener[Unit] { _ => - metrics.get("bytes_written").foreach { m => + // V1 parquet writes still use Comet's internal naming (`bytes_written` / `rows_written`); + // V2 Iceberg writes mirror Spark's stock V2 names (`numOutputBytes` / `numOutputRows`). + // Fall back to the V1 names so this helper covers both. + metrics.get("numOutputBytes").orElse(metrics.get("bytes_written")).foreach { m => ctx.taskMetrics().outputMetrics.setBytesWritten(m.value) } - metrics.get("rows_written").foreach { m => + metrics.get("numOutputRows").orElse(metrics.get("rows_written")).foreach { m => ctx.taskMetrics().outputMetrics.setRecordsWritten(m.value) } } @@ -321,10 +324,21 @@ object CometMetricNode { /** * Creates a [[CometMetricNode]] from a [[CometPlan]]. + * + * Stops walking at non-Comet nodes: a JVM-side `AQEShuffleReadExec` (or any other Spark exec + * constructed off the planning thread) captures `SparkPlan.session` eagerly as a `@transient + * final val`, which can be `null` if `SparkSession.getActiveSession` returned `None` at the + * moment AQE's stage-finalisation rules built it. Forcing such a node's `metrics` lazy val NPEs + * in `SQLMetrics.createMetric(sparkContext, ...)`. We don't own those metrics anyway -- the + * native side only sources updates against operators it actually planned, and JVM-side + * AQE-stage nodes belong to a different stage that's already been materialised independently. */ - def fromCometPlan(cometPlan: SparkPlan): CometMetricNode = { - val children = cometPlan.children.map(fromCometPlan) - CometMetricNode(cometPlan.metrics, children) + def fromCometPlan(cometPlan: SparkPlan): CometMetricNode = cometPlan match { + case _: CometPlan => + val children = cometPlan.children.map(fromCometPlan) + CometMetricNode(cometPlan.metrics, children) + case _ => + CometMetricNode(Map.empty, Nil) } /** diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/IcebergCommitExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/IcebergCommitExec.scala new file mode 100644 index 0000000000..d743bed7bb --- /dev/null +++ b/spark/src/main/scala/org/apache/spark/sql/comet/IcebergCommitExec.scala @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.comet + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.connector.write.{BatchWrite, WriterCommitMessage} +import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.datasources.v2.V2CommandExec +import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} + +/** + * Driver-side committer for Comet's split-operator Iceberg V2 write -- the "committer" half + * described in `IcebergWriteStrategy`. Collects per-task [[WriterCommitMessage]]s from the paired + * [[IcebergWriteExec]] and hands them to `BatchWrite.commit`, then refreshes Spark's cache for + * the table. + */ +case class IcebergCommitExec( + // `run()` is invoked driver-side (V2CommandExec contract), so neither field needs to cross + // the JVM boundary. + @transient batchWrite: BatchWrite, + @transient refreshCache: () => Unit, + child: SparkPlan) + extends V2CommandExec + with UnaryExecNode + with Logging { + + override def output: Seq[Attribute] = Nil + + override lazy val metrics: Map[String, SQLMetric] = Map( + "numCommittedMessages" -> SQLMetrics + .createMetric(sparkContext, "number of task commit messages")) + + override protected def run(): Seq[InternalRow] = { + val messages: Array[WriterCommitMessage] = child.executeCollect().map { row => + IcebergWriteExec.deserializeMessage(row.getBinary(0)) + } + longMetric("numCommittedMessages").add(messages.length) + + try { + messages.foreach(batchWrite.onDataWriterCommit) + batchWrite.commit(messages) + logInfo(s"Iceberg commit succeeded with ${messages.length} task message(s)") + } catch { + case cause: Throwable => + logError(s"Iceberg commit failed; aborting ${messages.length} task message(s)", cause) + try batchWrite.abort(messages) + catch { + case abortFailure: Throwable => + cause.addSuppressed(abortFailure) + } + throw cause + } + + refreshCache() + Nil + } + + override protected def withNewChildInternal(newChild: SparkPlan): IcebergCommitExec = + copy(child = newChild) + + override def nodeName: String = "IcebergCommit" +} diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/IcebergWriteExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/IcebergWriteExec.scala new file mode 100644 index 0000000000..9f9024bdd4 --- /dev/null +++ b/spark/src/main/scala/org/apache/spark/sql/comet/IcebergWriteExec.scala @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.comet + +import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream} + +import org.apache.spark.TaskContext +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} +import org.apache.spark.sql.catalyst.expressions.UnsafeProjection +import org.apache.spark.sql.catalyst.plans.physical.{Distribution, UnspecifiedDistribution} +import org.apache.spark.sql.connector.write.{BatchWrite, DataWriter, PhysicalWriteInfoImpl, WriterCommitMessage} +import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} +import org.apache.spark.sql.types.{BinaryType, StructField, StructType} +import org.apache.spark.util.Utils + +import org.apache.comet.iceberg.ReplaceDataDispatchInfo + +/** + * Executor-side file writer for Comet's split-operator Iceberg V2 write -- the "file writer exec" + * half described in `IcebergWriteStrategy`. Per task, opens a [[DataWriter]] from the connector's + * factory, drains the upstream RDD, and emits a single row containing the Java-serialised + * [[WriterCommitMessage]] for the paired [[IcebergCommitExec]] to consume. + */ +case class IcebergWriteExec( + // `doExecute()` runs driver-side and turns `batchWrite` into a serialisable + // `DataWriterFactory`; only the factory ships to executors via closure capture, so + // `batchWrite` itself never crosses. + @transient batchWrite: BatchWrite, + child: SparkPlan, + replaceDataDispatch: Option[ReplaceDataDispatchInfo] = None) + extends UnaryExecNode { + + override def output: Seq[Attribute] = Seq( + AttributeReference(IcebergWriteExec.CommitMessageColumn, BinaryType, nullable = false)()) + + // `DistributionAndOrderingUtils` already injected the shuffles/sorts the write needed above the + // original logical-write node; re-requesting them here would force a second shuffle. + override def requiredChildDistribution: Seq[Distribution] = Seq(UnspecifiedDistribution) + + override lazy val metrics: Map[String, SQLMetric] = Map( + "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) + + override protected def doExecute(): RDD[InternalRow] = { + val rdd = child.execute() + val factory = batchWrite.createBatchWriterFactory(PhysicalWriteInfoImpl(rdd.getNumPartitions)) + // None of Iceberg's `BatchWrite` impls return true here -- guard catches a future change + // rather than silently producing duplicate files. + require( + !batchWrite.useCommitCoordinator(), + "Comet's Iceberg write path does not currently support BatchWrite implementations that " + + "require Spark's commit coordinator; received: " + batchWrite.getClass.getName) + + val rowsMetric = longMetric("numOutputRows") + val schemaTypes = output.map(_.dataType).toArray + val capturedReplaceDataDispatch = replaceDataDispatch + rdd.mapPartitionsInternal { iter => + val partId = TaskContext.getPartitionId() + val taskId = TaskContext.get().taskAttemptId() + val writer = factory.createWriter(partId, taskId) + // Driver-side `executeCollect` casts each emitted row to `UnsafeRow`; project through + // `UnsafeProjection` to satisfy that contract. + val projection = UnsafeProjection.create(schemaTypes) + IcebergWriteExec.runWriter( + writer, + iter, + rowsMetric, + projection, + capturedReplaceDataDispatch) + } + } + + override protected def withNewChildInternal(newChild: SparkPlan): IcebergWriteExec = + copy(child = newChild) + + override def nodeName: String = "IcebergWrite" +} + +object IcebergWriteExec { + + /** Name of the single Binary column carrying serialized commit messages. */ + val CommitMessageColumn: String = "iceberg_commit_message" + + /** Schema of [[IcebergWriteExec]] output. */ + val OutputSchema: StructType = StructType( + Seq(StructField(CommitMessageColumn, BinaryType, nullable = false))) + + /** + * Per-task write loop. Drains `iter` into `writer`, commits locally, and returns a single-row + * iterator carrying the serialised [[WriterCommitMessage]]. On any exception, calls + * `writer.abort()` to release task-local resources -- staged files become orphans, swept up by + * `RemoveOrphanFiles`. + */ + def runWriter( + writer: DataWriter[InternalRow], + iter: Iterator[InternalRow], + rowsMetric: SQLMetric, + projection: UnsafeProjection, + replaceDataDispatch: Option[ReplaceDataDispatchInfo]): Iterator[InternalRow] = { + val message = Utils.tryWithSafeFinallyAndFailureCallbacks(block = { + if (replaceDataDispatch.isDefined) { + runReplaceDataWriter(writer, iter, replaceDataDispatch.get, rowsMetric) + } else { + while (iter.hasNext) { + writer.write(iter.next()) + rowsMetric.add(1L) + } + } + writer.commit() + })( + catchBlock = { + writer.abort() + }, + finallyBlock = { + writer.close() + }) + + Iterator.single(projection(InternalRow(serializeMessage(message))).copy()) + } + + // Mirrors org.apache.spark.sql.catalyst.util.RowDeltaUtils. Inlined because that object is + // `private[spark]` -- we can't import it. The values 5 / 6 are verified against the shipped + // spark-catalyst 4.0.0, 4.1.1, and 4.2.0-preview4 jars (`RowDeltaUtils$` static initializer: + // WRITE_OPERATION=5, WRITE_WITH_METADATA_OPERATION=6), and the shipped `ReplaceData` writing + // tasks dispatch on exactly these codes. + // + // TODO(spark-trunk-rename): Spark `main` (post-4.2.0-preview4) renames these to + // COPY_OPERATION(5)/UPDATE_OPERATION(2)/INSERT_OPERATION(3) and re-dispatches the writing tasks, + // dropping WRITE_WITH_METADATA. When Comet starts targeting a Spark release carrying that rename, + // these constants and `runReplaceDataWriter`'s dispatch must be revisited. The CoW UPDATE / MERGE + // cases in `CometIcebergWriteActionSuite` are the regression that will fail first -- start there. + // + // WRITE_OPERATION (5) and WRITE_WITH_METADATA_OPERATION (6) are Spark 4.x-only -- emitted by + // `ReplaceData`'s rewritten row stream alongside `ReplaceDataProjections`. + private val WRITE_OPERATION = 5 + private val WRITE_WITH_METADATA_OPERATION = 6 + + // Spark 4.x added a two-arg `DataWriter#write(T metadata, T row)`. 3.4 / 3.5's `DataWriter` + // only has the single-arg `write(T row)`. Reflected once per JVM and reused by the + // WRITE_WITH_METADATA dispatch. + @transient private lazy val dataWriterWriteWithMetadataMethod + : Option[java.lang.reflect.Method] = + try Some(classOf[DataWriter[_]].getMethod("write", classOf[Object], classOf[Object])) + catch { case _: NoSuchMethodException => None } + + def serializeMessage(message: WriterCommitMessage): Array[Byte] = { + val bos = new ByteArrayOutputStream() + val oos = new ObjectOutputStream(bos) + try oos.writeObject(message) + finally oos.close() + bos.toByteArray + } + + def deserializeMessage(bytes: Array[Byte]): WriterCommitMessage = { + val bis = new ByteArrayInputStream(bytes) + val ois = new ObjectInputStream(bis) + try ois.readObject().asInstanceOf[WriterCommitMessage] + finally ois.close() + } + + /** + * Spark 4.x copy-on-write row dispatch. The reflective `write(metadata, row)` lookup falls back + * to a clear error on 3.x, since `WRITE_WITH_METADATA_OPERATION` is 4.x-only. + */ + private def runReplaceDataWriter( + writer: DataWriter[InternalRow], + iter: Iterator[InternalRow], + dispatch: ReplaceDataDispatchInfo, + rowsMetric: SQLMetric): Unit = { + val rowProjection = dispatch.rowProjection + val metadataProjection = dispatch.metadataProjection.orNull + while (iter.hasNext) { + val row = iter.next() + rowsMetric.add(1L) + row.getInt(0) match { + case WRITE_OPERATION => + rowProjection.project(row) + writer.write(rowProjection) + case WRITE_WITH_METADATA_OPERATION => + rowProjection.project(row) + if (metadataProjection != null) metadataProjection.project(row) + val writeWithMetadata = dataWriterWriteWithMetadataMethod.getOrElse( + throw new UnsupportedOperationException( + "DataWriter.write(metadata, row) is not available in this Spark version but the " + + s"analyzer emitted operation code $WRITE_WITH_METADATA_OPERATION")) + writeWithMetadata.invoke(writer, metadataProjection, rowProjection) + case other => + throw new IllegalArgumentException( + s"Unexpected ReplaceData operation code $other; supported: " + + s"$WRITE_OPERATION (WRITE), $WRITE_WITH_METADATA_OPERATION (WRITE_WITH_METADATA)") + } + } + } +} diff --git a/spark/src/main/spark-3.x/org/apache/comet/iceberg/IcebergRefreshCacheShim.scala b/spark/src/main/spark-3.x/org/apache/comet/iceberg/IcebergRefreshCacheShim.scala new file mode 100644 index 0000000000..e68cdb69f7 --- /dev/null +++ b/spark/src/main/spark-3.x/org/apache/comet/iceberg/IcebergRefreshCacheShim.scala @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.iceberg + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan + +/** + * Spark 3.4 / 3.5 -- a single `org.apache.spark.sql.SparkSession` is what `CacheManager` methods + * accept, so the call is a direct passthrough. + */ +private[iceberg] object IcebergRefreshCacheShim { + def recacheByPlan(plan: LogicalPlan): Unit = { + val session = SparkSession.active + session.sharedState.cacheManager.recacheByPlan(session, plan) + } +} diff --git a/spark/src/main/spark-3.x/org/apache/comet/iceberg/IcebergReplaceDataShim.scala b/spark/src/main/spark-3.x/org/apache/comet/iceberg/IcebergReplaceDataShim.scala new file mode 100644 index 0000000000..8f4ddb7ff0 --- /dev/null +++ b/spark/src/main/spark-3.x/org/apache/comet/iceberg/IcebergReplaceDataShim.scala @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.iceberg + +import org.apache.spark.sql.catalyst.plans.logical.ReplaceData + +/** + * Spark 3.4 / 3.5: `ReplaceData` has no `projections` field -- the rewritten row stream contains + * post-projection data directly, no operation column. Returning `None` keeps the writer's plain + * `writer.write(row)` loop in play. + */ +private[iceberg] object IcebergReplaceDataShim { + def extractProjections(rd: ReplaceData): Option[ReplaceDataDispatchInfo] = None +} diff --git a/spark/src/main/spark-4.x/org/apache/comet/iceberg/IcebergRefreshCacheShim.scala b/spark/src/main/spark-4.x/org/apache/comet/iceberg/IcebergRefreshCacheShim.scala new file mode 100644 index 0000000000..abe9c40856 --- /dev/null +++ b/spark/src/main/spark-4.x/org/apache/comet/iceberg/IcebergRefreshCacheShim.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.iceberg + +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.classic.SparkSession + +/** + * Spark 4.x -- `org.apache.spark.sql.SparkSession` is the api-module interface (no `sharedState` + * for use with `CacheManager`), so we resolve the active classic session, whose `sharedState` + * exposes the concrete `CacheManager` whose `recacheByPlan` expects classic. + */ +private[iceberg] object IcebergRefreshCacheShim { + def recacheByPlan(plan: LogicalPlan): Unit = { + val session = SparkSession.active + session.sharedState.cacheManager.recacheByPlan(session, plan) + } +} diff --git a/spark/src/main/spark-4.x/org/apache/comet/iceberg/IcebergReplaceDataShim.scala b/spark/src/main/spark-4.x/org/apache/comet/iceberg/IcebergReplaceDataShim.scala new file mode 100644 index 0000000000..c860580005 --- /dev/null +++ b/spark/src/main/spark-4.x/org/apache/comet/iceberg/IcebergReplaceDataShim.scala @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.iceberg + +import org.apache.spark.sql.catalyst.plans.logical.ReplaceData + +/** + * Spark 4.x: `ReplaceData` carries `projections: ReplaceDataProjections` -- the rewritten row + * stream is prefixed with an operation column (5=WRITE, 6=WRITE_WITH_METADATA), and we have to + * apply `dataProj` / `metadataProj` before handing rows to the underlying `DataWriter`. + */ +private[iceberg] object IcebergReplaceDataShim { + def extractProjections(rd: ReplaceData): Option[ReplaceDataDispatchInfo] = { + val p = rd.projections + Some(ReplaceDataDispatchInfo(p.rowProjection, p.metadataProjection)) + } +} diff --git a/spark/src/test/scala/org/apache/comet/CometFuzzIcebergBase.scala b/spark/src/test/scala/org/apache/comet/CometFuzzIcebergBase.scala index 92f36f534b..f15d57e250 100644 --- a/spark/src/test/scala/org/apache/comet/CometFuzzIcebergBase.scala +++ b/spark/src/test/scala/org/apache/comet/CometFuzzIcebergBase.scala @@ -42,13 +42,20 @@ class CometFuzzIcebergBase extends CometTestBase with AdaptiveSparkPlanHelper { var warehouseDir: File = null val icebergTableName: String = "hadoop_catalog.db.fuzz_test" - // Skip these tests if Iceberg is not available in classpath + // Skip when Iceberg is unavailable OR unusable on this Spark version. There is no published + // Iceberg Spark 4.1+ runtime yet (apache/iceberg#15238); the spark-4.1 / spark-4.2 profiles pin + // the binary-incompatible 4.0 runtime as a build-only stopgap, so gate on Spark version to skip + // rather than abort. Remove once the pom adopts iceberg-spark-runtime-4.1 (Iceberg 1.11.0+). private def icebergAvailable: Boolean = { - try { - Class.forName("org.apache.iceberg.catalog.Catalog") - true - } catch { - case _: ClassNotFoundException => false + if (CometSparkSessionExtensions.isSpark41Plus) { + false + } else { + try { + Class.forName("org.apache.iceberg.catalog.Catalog") + true + } catch { + case _: ClassNotFoundException => false + } } } diff --git a/spark/src/test/scala/org/apache/comet/CometIcebergRewriteActionSuite.scala b/spark/src/test/scala/org/apache/comet/CometIcebergRewriteActionSuite.scala index 53454d0034..10d3485589 100644 --- a/spark/src/test/scala/org/apache/comet/CometIcebergRewriteActionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometIcebergRewriteActionSuite.scala @@ -23,6 +23,9 @@ import java.io.File import scala.collection.mutable +import org.apache.hadoop.fs.Path +import org.apache.parquet.hadoop.ParquetFileReader +import org.apache.parquet.hadoop.util.HadoopInputFile import org.apache.spark.CometListenerBusUtils import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent} import org.apache.spark.sql.{CometTestBase, SparkSession} @@ -47,7 +50,10 @@ class CometIcebergRewriteActionSuite extends CometTestBase with CometIcebergTest RewriteCase( table = s"$catalog.db.binpack_test", configureMode = invoke(_, "binPack"), - verifyPlans = rewritePlans => assertReadsAreComet(rewritePlans))) + verifyPlans = { rewritePlans => + assertReadsAreComet(rewritePlans) + assertWritesAreComet(rewritePlans) + })) } test("sort rewrite runs scan, exchange, and sort natively in Comet") { @@ -59,6 +65,7 @@ class CometIcebergRewriteActionSuite extends CometTestBase with CometIcebergTest verifyDataAfter = assertSortedById, verifyPlans = { rewritePlans => assertReadsAreComet(rewritePlans) + assertWritesAreComet(rewritePlans) assertOperator(rewritePlans, "CometExchange") assertOperator(rewritePlans, "CometSort") })) @@ -77,6 +84,7 @@ class CometIcebergRewriteActionSuite extends CometTestBase with CometIcebergTest verifyDataAfter = assertSortedById, verifyPlans = { rewritePlans => assertReadsAreComet(rewritePlans) + assertWritesAreComet(rewritePlans) assertOperator(rewritePlans, "CometExchange") assertOperator(rewritePlans, "CometSort") })) @@ -204,9 +212,11 @@ class CometIcebergRewriteActionSuite extends CometTestBase with CometIcebergTest s"Expected >= 1 input file rewritten, got $rewrittenCount") } - val rewritePlans = plans.filter(_.hasNode("AppendData")) + val rewritePlans = plans.filter(_.isRewriteWrite) assert(rewritePlans.nonEmpty, "Expected at least one rewrite plan") assertReadsAreComet(rewritePlans) + assertWritesAreComet(rewritePlans) + assertCurrentDataFilesWrittenByCometNative(table) val rowsAfter = spark.sql(s"SELECT * FROM $table ORDER BY id").collect().toSeq assert( @@ -266,12 +276,13 @@ class CometIcebergRewriteActionSuite extends CometTestBase with CometIcebergTest assertDataPreserved(rowsBefore, rowsAfterById, filesBefore, filesAfter) rc.verifyDataAfter(rowsAfterFileOrder) - val rewritePlans = plans.filter(_.hasNode("AppendData")) + val rewritePlans = plans.filter(_.isRewriteWrite) assert( rewritePlans.nonEmpty, - "Expected at least one captured plan with AppendData but got none.\n" + - dumpPlans(plans)) + "Expected at least one captured rewrite plan (AppendData or IcebergCommit) but got " + + "none.\n" + dumpPlans(plans)) rc.verifyPlans(rewritePlans) + assertCurrentDataFilesWrittenByCometNative(rc.table) } catch { case _: ClassNotFoundException => cancel("Iceberg Actions API not available - requires iceberg-spark-runtime") @@ -294,6 +305,12 @@ class CometIcebergRewriteActionSuite extends CometTestBase with CometIcebergTest private case class CapturedPlan(physicalDescription: String, nodeNames: Set[String]) { def hasNode(name: String): Boolean = nodeNames.contains(name) def containsCometOperator: Boolean = nodeNames.exists(_.startsWith("Comet")) + + /** + * A rewrite write-side plan, on either the stock (`AppendData`) or split-op (`IcebergCommit`) + * shape. + */ + def isRewriteWrite: Boolean = hasNode("AppendData") || hasNode("IcebergCommit") } private def captureSqlPlans(body: => Unit): Seq[CapturedPlan] = { @@ -339,6 +356,42 @@ class CometIcebergRewriteActionSuite extends CometTestBase with CometIcebergTest dumpPlans(plans)) } + /** + * The rewrite action writes via `df.writeTo(...).append()`, which `IcebergWriteStrategy` routes + * through the two-op plan; `CometIcebergNativeWrite`'s serde then upgrades the inner writer to + * `CometIcebergWrite` when the upstream plan is fully Comet-native. + */ + private def assertWritesAreComet(plans: Seq[CapturedPlan]): Unit = { + assertOperator(plans, "CometIcebergWrite") + } + + /** + * Runtime check that the iceberg-rust writer actually produced the current snapshot's data + * files (not just that `CometIcebergWriteExec` was planned). `CometIcebergNativeWrite` stamps + * `Apache Iceberg (Comet)` into the parquet footer's `created_by` field; the JVM path + * omits the `(Comet)` suffix. We read each file's footer and pin that signature. + */ + private def assertCurrentDataFilesWrittenByCometNative(table: String): Unit = { + val paths = spark + .sql(s"SELECT file_path FROM $table.data_files") + .collect() + .map(_.getString(0)) + .toSeq + assert(paths.nonEmpty, s"Expected $table to have >= 1 data file in the current snapshot") + val conf = spark.sparkContext.hadoopConfiguration + paths.foreach { path => + val hadoopPath = new Path(path.stripPrefix("file:")) + val reader = ParquetFileReader.open(HadoopInputFile.fromPath(hadoopPath, conf)) + val createdBy = + try reader.getFooter.getFileMetaData.getCreatedBy + finally reader.close() + assert( + createdBy != null && createdBy.contains("(Comet)"), + s"Expected $path to be written by Comet's native iceberg-rust writer " + + s"(`(Comet)` in parquet `created_by`); got '$createdBy'") + } + } + private def assertOperator(plans: Seq[CapturedPlan], operator: String): Unit = { val matching = plans.count(_.hasNode(operator)) assert( @@ -417,6 +470,8 @@ class CometIcebergRewriteActionSuite extends CometTestBase with CometIcebergTest CometConf.COMET_ENABLED.key -> "true", CometConf.COMET_EXEC_ENABLED.key -> "true", CometConf.COMET_ICEBERG_NATIVE_ENABLED.key -> "true", + CometConf.COMET_ICEBERG_WRITE_SPLIT_OPERATOR_ENABLED.key -> "true", + CometConf.COMET_ICEBERG_NATIVE_WRITE_ENABLED.key -> "true", CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key -> "true")(body) /** Creates an Iceberg table with `numFiles` separate appends, each producing one data file. */ diff --git a/spark/src/test/scala/org/apache/comet/CometIcebergTestBase.scala b/spark/src/test/scala/org/apache/comet/CometIcebergTestBase.scala index eac78442b9..9ea6215a56 100644 --- a/spark/src/test/scala/org/apache/comet/CometIcebergTestBase.scala +++ b/spark/src/test/scala/org/apache/comet/CometIcebergTestBase.scala @@ -22,6 +22,7 @@ package org.apache.comet import java.io.File import java.nio.file.Files +import org.apache.comet.CometSparkSessionExtensions.isSpark41Plus import org.apache.comet.iceberg.IcebergReflection /** @@ -29,13 +30,23 @@ import org.apache.comet.iceberg.IcebergReflection */ trait CometIcebergTestBase { + /** + * True only when Iceberg is on the classpath AND usable on the running Spark version. Apache + * Iceberg does not yet publish a Spark 4.1+ runtime (tracked in apache/iceberg#15238); Comet's + * spark-4.1 / spark-4.2 profiles pin `iceberg-spark-runtime-4.0` as a build-only stopgap, which + * is binary-incompatible with Spark 4.1's Catalyst (e.g. `trees.Origin`) and aborts the moment + * an Iceberg SQL-extensions statement is parsed. Gate on Spark version so these suites skip + * cleanly instead of aborting. Remove the version check once the pom adopts a real + * `iceberg-spark-runtime-4.1` (Iceberg 1.11.0+). + */ protected def icebergAvailable: Boolean = - try { - IcebergReflection.loadClass("org.apache.iceberg.catalog.Catalog") - true - } catch { - case _: ClassNotFoundException => false - } + !isSpark41Plus && + (try { + IcebergReflection.loadClass("org.apache.iceberg.catalog.Catalog") + true + } catch { + case _: ClassNotFoundException => false + }) protected def withTempIcebergDir(f: File => Unit): Unit = { val dir = Files.createTempDirectory("comet-iceberg-test").toFile diff --git a/spark/src/test/scala/org/apache/comet/CometIcebergWriteActionSuite.scala b/spark/src/test/scala/org/apache/comet/CometIcebergWriteActionSuite.scala new file mode 100644 index 0000000000..a69bc84886 --- /dev/null +++ b/spark/src/test/scala/org/apache/comet/CometIcebergWriteActionSuite.scala @@ -0,0 +1,787 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet + +import java.io.File + +import scala.collection.mutable + +import org.apache.spark.{CometListenerBusUtils, SparkConf} +import org.apache.spark.sql.CometTestBase +import org.apache.spark.sql.Row +import org.apache.spark.sql.comet.{CometIcebergWriteExec, IcebergCommitExec, IcebergWriteExec} +import org.apache.spark.sql.execution.{QueryExecution, SparkPlan} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.util.QueryExecutionListener + +/** + * State sampled around each test action to check commit-once invariants. `snapshotDelta` is the + * change in the Iceberg table's snapshot history during `action`; for a single Comet-Iceberg + * commit it should be exactly `1` (one logical write = one Iceberg snapshot). + */ +private case class WriteSnapshot(snapshotDelta: Long, plans: Seq[SparkPlan]) + +/** + * End-to-end coverage for Comet's two-op Iceberg V2 write path. Every test runs against a fresh + * temporary Hadoop catalog (warehouse directory deleted at the end of `withTempIcebergDir`), + * drives the write through Spark SQL, then asserts: + * - the captured executed plan contains the expected `IcebergCommitExec` + `IcebergWriteExec` + * pair (or, for fallback scenarios, does NOT contain them); + * - the committed data is queryable with the expected row set. + * + * Variant coverage mirrors Iceberg's copy-on-write V2 logical-write surface: + * - `AppendData` (INSERT INTO) -- unpartitioned, partitioned, empty result, INSERT FROM SELECT. + * - `OverwriteByExpression` (INSERT OVERWRITE static partition mode). + * - `OverwritePartitionsDynamic` (INSERT OVERWRITE dynamic partition mode). + * - `ReplaceData` (copy-on-write DELETE/UPDATE/MERGE). + * + * `WriteDelta` (merge-on-read DML) is intentionally not intercepted; it stays on Spark's default + * path. Plus negative cases: config disabled => Spark's default path. + */ +class CometIcebergWriteActionSuite + extends CometTestBase + with AdaptiveSparkPlanHelper + with CometIcebergTestBase { + + override protected def sparkConf: SparkConf = { + // Mirror the documented Comet + Iceberg setup in `docs/source/user-guide/latest/iceberg.md`: + // `IcebergSparkSessionExtensions` provides the analyzer rules that lower UPDATE / MERGE on + // V2 tables into `ReplaceData` -- without it, Spark 3.4's stock + // `SparkStrategies$BasicOperators` throws `"UPDATE TABLE is not supported temporarily"` for + // V2 paths and the SQL tests cancel. + super.sparkConf + .set(CometConf.COMET_ICEBERG_WRITE_SPLIT_OPERATOR_ENABLED.key, "true") + .set( + "spark.sql.extensions", + "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") + } + + // --- AppendData (INSERT INTO) ------------------------------------------------------------------ + + test("AppendData unpartitioned INSERT INTO routes through two-op") { + assume(icebergAvailable, "Iceberg not available in classpath") + withIcebergCatalog { warehouseDir => + createTable(warehouseDir, "append_unpart", partitionSpec = "") + val snapshot = captureWrite("append_unpart") { + spark.sql( + "INSERT INTO cat.db.append_unpart VALUES " + + "(1, 'us-east', 10.5), (2, 'us-west', 20.3), (3, 'eu', 30.7)") + } + assertExactlyOneCommit(snapshot) + assertRows("append_unpart", expectedIds = Seq(1, 2, 3)) + } + } + + test("AppendData partitioned INSERT INTO routes through two-op") { + assume(icebergAvailable, "Iceberg not available in classpath") + withIcebergCatalog { warehouseDir => + createTable(warehouseDir, "append_part", partitionSpec = "PARTITIONED BY (region)") + val snapshot = captureWrite("append_part") { + spark.sql( + "INSERT INTO cat.db.append_part VALUES " + + "(1, 'us-east', 10.5), (2, 'us-east', 20.3), (3, 'eu', 30.7)") + } + assertExactlyOneCommit(snapshot) + assertRows("append_part", expectedIds = Seq(1, 2, 3)) + } + } + + test("AppendData INSERT FROM SELECT survives the intervening exchange/sort") { + assume(icebergAvailable, "Iceberg not available in classpath") + withIcebergCatalog { warehouseDir => + createTable(warehouseDir, "src", partitionSpec = "") + createTable(warehouseDir, "append_from_select", partitionSpec = "PARTITIONED BY (region)") + spark.sql( + "INSERT INTO cat.db.src VALUES " + + "(1, 'us-east', 10.5), (2, 'us-west', 20.3), (3, 'eu', 30.7)") + + val snapshot = captureWrite("append_from_select") { + spark.sql( + "INSERT INTO cat.db.append_from_select " + + "SELECT id, region, amount FROM cat.db.src ORDER BY id") + } + // The capture sees the SELECT (driven by spark.read) AND the INSERT. We only care that + // *some* captured plan has the two-op pair, since the SELECT plan won't. + assertExactlyOneCommit(snapshot) + assertRows("append_from_select", expectedIds = Seq(1, 2, 3)) + } + } + + test("AppendData on an empty source still emits a single commit") { + assume(icebergAvailable, "Iceberg not available in classpath") + withIcebergCatalog { warehouseDir => + createTable(warehouseDir, "empty_target", partitionSpec = "") + val snapshot = captureWrite("empty_target") { + spark.sql( + "INSERT INTO cat.db.empty_target SELECT id, region, amount " + + "FROM (SELECT 1 AS id, 'r' AS region, 1.0 AS amount) WHERE id < 0") + } + assertExactlyOneCommit(snapshot) + assertRows("empty_target", expectedIds = Seq.empty) + } + } + + // --- OverwriteByExpression (INSERT OVERWRITE static) ------------------------------------------- + + test("OverwriteByExpression replaces existing rows via two-op") { + assume(icebergAvailable, "Iceberg not available in classpath") + withIcebergCatalog { warehouseDir => + createTable(warehouseDir, "overwrite_static", partitionSpec = "") + spark.sql( + "INSERT INTO cat.db.overwrite_static VALUES " + + "(1, 'old', 1.0), (2, 'old', 2.0), (3, 'old', 3.0)") + + val snapshot = captureWrite("overwrite_static") { + withSQLConf("spark.sql.sources.partitionOverwriteMode" -> "STATIC") { + spark.sql( + "INSERT OVERWRITE cat.db.overwrite_static VALUES " + + "(10, 'new', 100.0), (11, 'new', 110.0)") + } + } + assertExactlyOneCommit(snapshot) + assertRows("overwrite_static", expectedIds = Seq(10, 11)) + } + } + + // --- OverwritePartitionsDynamic (INSERT OVERWRITE dynamic) ------------------------------------- + + test("OverwritePartitionsDynamic replaces only touched partitions") { + assume(icebergAvailable, "Iceberg not available in classpath") + withIcebergCatalog { warehouseDir => + createTable(warehouseDir, "overwrite_dynamic", partitionSpec = "PARTITIONED BY (region)") + spark.sql( + "INSERT INTO cat.db.overwrite_dynamic VALUES " + + "(1, 'us-east', 1.0), (2, 'us-west', 2.0), (3, 'eu', 3.0)") + + val snapshot = captureWrite("overwrite_dynamic") { + withSQLConf("spark.sql.sources.partitionOverwriteMode" -> "DYNAMIC") { + // Overwrite only the 'us-east' partition; 'us-west' and 'eu' should survive. + spark.sql("INSERT OVERWRITE cat.db.overwrite_dynamic VALUES (10, 'us-east', 100.0)") + } + } + assertExactlyOneCommit(snapshot) + // Order changes after dynamic overwrite; assert membership rather than positional rows. + val ids = spark + .sql("SELECT id FROM cat.db.overwrite_dynamic ORDER BY id") + .collect() + .map(_.getInt(0)) + .toSeq + assert(ids == Seq(2, 3, 10), s"expected (2,3,10), got $ids") + } + } + + // --- ReplaceData (copy-on-write DML) ----------------------------------------------------------- + + test("ReplaceData (CoW DELETE) on a row predicate goes through two-op") { + assume(icebergAvailable, "Iceberg not available in classpath") + withIcebergCatalog { warehouseDir => + createTable( + warehouseDir, + "cow_delete", + partitionSpec = "", + // Default V2 write mode is CoW already, but be explicit for resilience to upstream defaults. + properties = Some("'write.delete.mode'='copy-on-write'")) + // Use Spark's default path for the setup INSERT to isolate the DELETE under test from any + // interaction with our two-op AppendData path. Test correctness depends on the DELETE alone. + withSQLConf(CometConf.COMET_ICEBERG_WRITE_SPLIT_OPERATOR_ENABLED.key -> "false") { + coalesceInsert( + "cow_delete", + Seq((1, "us-east", 10.0), (2, "us-west", 20.0), (3, "eu", 30.0), (4, "us-east", 40.0))) + } + + val snapshot = captureWrite("cow_delete") { + spark.sql("DELETE FROM cat.db.cow_delete WHERE id = 2") + } + assertExactlyOneCommit(snapshot) + assertRows("cow_delete", expectedIds = Seq(1, 3, 4)) + } + } + + test("ReplaceData (CoW UPDATE) routes through two-op") { + assume(icebergAvailable, "Iceberg not available in classpath") + withIcebergCatalog { warehouseDir => + createTable( + warehouseDir, + "cow_update", + partitionSpec = "", + properties = Some("'write.update.mode'='copy-on-write'")) + coalesceInsert( + "cow_update", + Seq((1, "us-east", 10.0), (2, "us-west", 20.0), (3, "eu", 30.0))) + + val snapshot = captureWrite("cow_update") { + spark.sql("UPDATE cat.db.cow_update SET amount = amount * 2 WHERE id = 2") + } + assertExactlyOneCommit(snapshot) + val r = spark + .sql("SELECT id, amount FROM cat.db.cow_update WHERE id = 2") + .collect() + assert(r.length == 1 && r(0).getDouble(1) == 40.0, s"got ${r.toSeq}") + } + } + + test("ReplaceData (CoW MERGE) routes through two-op") { + assume(icebergAvailable, "Iceberg not available in classpath") + withIcebergCatalog { warehouseDir => + createTable( + warehouseDir, + "cow_merge", + partitionSpec = "", + properties = Some("'write.merge.mode'='copy-on-write'")) + coalesceInsert("cow_merge", Seq((1, "us-east", 10.0), (2, "us-west", 20.0))) + + // Use a one-row source via inline VALUES so the merge has both matched + unmatched legs. + val snapshot = captureWrite("cow_merge") { + spark.sql(""" + |MERGE INTO cat.db.cow_merge t + |USING (SELECT 2 AS id, 'us-west' AS region, 200.0 AS amount UNION ALL + | SELECT 3 AS id, 'eu' AS region, 30.0 AS amount) s + |ON t.id = s.id + |WHEN MATCHED THEN UPDATE SET t.amount = s.amount + |WHEN NOT MATCHED THEN INSERT (id, region, amount) VALUES (s.id, s.region, s.amount) + |""".stripMargin) + } + assertExactlyOneCommit(snapshot) + assertRows("cow_merge", expectedIds = Seq(1, 2, 3)) + } + } + + // --- Fall-back when config is disabled --------------------------------------------------------- + + test("sanity check: Spark's default DELETE path works against a Hadoop catalog") { + assume(icebergAvailable, "Iceberg not available in classpath") + withIcebergCatalog { warehouseDir => + withSQLConf(CometConf.COMET_ICEBERG_WRITE_SPLIT_OPERATOR_ENABLED.key -> "false") { + createTable( + warehouseDir, + "spark_cow_delete", + partitionSpec = "", + properties = Some("'write.delete.mode'='copy-on-write'")) + coalesceInsert( + "spark_cow_delete", + Seq((1, "us-east", 10.0), (2, "us-west", 20.0), (3, "eu", 30.0), (4, "us-east", 40.0))) + spark.sql("DELETE FROM cat.db.spark_cow_delete WHERE id = 2") + assertRows("spark_cow_delete", expectedIds = Seq(1, 3, 4)) + } + } + } + + test("disabled config falls through to Spark's V2ExistingTableWriteExec") { + assume(icebergAvailable, "Iceberg not available in classpath") + withIcebergCatalog { warehouseDir => + createTable(warehouseDir, "disabled_conf", partitionSpec = "") + + val snapshot = captureWrite("disabled_conf") { + withSQLConf(CometConf.COMET_ICEBERG_WRITE_SPLIT_OPERATOR_ENABLED.key -> "false") { + spark.sql("INSERT INTO cat.db.disabled_conf VALUES (1, 'us-east', 10.5)") + } + } + // Spark's stock V2 write path still commits one snapshot to the table; we only assert + // that Comet's two-op execs do NOT appear in the captured plans. + val (commits, writes) = collectIcebergWriteOps(snapshot.plans) + assert(commits.isEmpty, s"unexpected IcebergCommitExec: $commits") + assert(writes.isEmpty, s"unexpected IcebergWriteExec: $writes") + assertRows("disabled_conf", expectedIds = Seq(1)) + } + } + + // --- Round-trip parity vs Spark default path --------------------------------------------------- + + // --- Native acceleration -------------------------------------------------------------------- + + test("native acceleration: AppendData INSERT FROM SELECT") { + assumeNativeAcceleration() + withIcebergCatalog { warehouseDir => + createTable(warehouseDir, "native_source", partitionSpec = "") + createTable(warehouseDir, "native_target", partitionSpec = "") + spark.sql( + "INSERT INTO cat.db.native_source VALUES " + + "(1, 'us-east', 10.5), (2, 'us-west', 20.3), (3, 'eu', 30.7)") + assertNativeWriteEngages("native_target", Seq(1, 2, 3)) { + spark.sql( + "INSERT INTO cat.db.native_target SELECT id, region, amount FROM cat.db.native_source") + } + } + } + + test("native acceleration: AppendData unpartitioned VALUES") { + assumeNativeAcceleration() + withIcebergCatalog { warehouseDir => + createTable(warehouseDir, "native_append_values", partitionSpec = "") + assertNativeWriteEngages("native_append_values", Seq(1, 2, 3)) { + spark.sql( + "INSERT INTO cat.db.native_append_values VALUES " + + "(1, 'us-east', 10.5), (2, 'us-west', 20.3), (3, 'eu', 30.7)") + } + } + } + + // Iceberg's Spark `SparkWrite$WriterFactory` never stamps the table sort order onto appended + // data files in any version Comet targets (1.5.2 / 1.8.1 / 1.10.0): committed files are + // `sort_order_id = 0` even when the table defines a sort order. The native path must match -- + // it defaults `sort_order_id` to 0 rather than propagating `table.sortOrder().orderId()`. Pin + // that against the `data_files` metadata table; before the fix the native path stamped the + // table's (non-zero) order id here. + test("native acceleration: appended files are sort_order_id=0 even for a sorted table") { + assumeNativeAcceleration() + withIcebergCatalog { warehouseDir => + createTable(warehouseDir, "native_sorted", partitionSpec = "") + // WRITE ORDERED BY (provided by IcebergSparkSessionExtensions, enabled in this suite) + // bumps the table's sort order id to a non-default value (1). + spark.sql("ALTER TABLE cat.db.native_sorted WRITE ORDERED BY id") + assertNativeWriteEngages("native_sorted", Seq(1, 2, 3)) { + spark.sql( + "INSERT INTO cat.db.native_sorted VALUES " + + "(3, 'eu', 30.7), (1, 'us-east', 10.5), (2, 'us-west', 20.3)") + } + val sortOrderIds = spark + .sql("SELECT DISTINCT sort_order_id FROM cat.db.native_sorted.data_files") + .collect() + .map(_.getInt(0)) + .toSet + assert( + sortOrderIds == Set(0), + "expected every committed data file to have sort_order_id=0 (matching Iceberg-Java), " + + s"got $sortOrderIds") + } + } + + test("native acceleration: AppendData partitioned by identity") { + assumeNativeAcceleration() + withIcebergCatalog { warehouseDir => + createTable(warehouseDir, "native_append_part", partitionSpec = "PARTITIONED BY (region)") + assertNativeWriteEngages("native_append_part", Seq(1, 2, 3)) { + spark.sql( + "INSERT INTO cat.db.native_append_part VALUES " + + "(1, 'us-east', 10.5), (2, 'us-east', 20.3), (3, 'eu', 30.7)") + } + } + } + + test("native acceleration: OverwriteByExpression (INSERT OVERWRITE STATIC)") { + assumeNativeAcceleration() + withIcebergCatalog { warehouseDir => + createTable(warehouseDir, "native_overwrite_static", partitionSpec = "") + spark.sql( + "INSERT INTO cat.db.native_overwrite_static VALUES " + + "(1, 'old', 1.0), (2, 'old', 2.0), (3, 'old', 3.0)") + assertNativeWriteEngages("native_overwrite_static", Seq(10, 11)) { + withSQLConf("spark.sql.sources.partitionOverwriteMode" -> "STATIC") { + spark.sql( + "INSERT OVERWRITE cat.db.native_overwrite_static VALUES " + + "(10, 'new', 100.0), (11, 'new', 110.0)") + } + } + } + } + + test("native acceleration: OverwritePartitionsDynamic") { + assumeNativeAcceleration() + withIcebergCatalog { warehouseDir => + createTable(warehouseDir, "native_overwrite_dyn", partitionSpec = "PARTITIONED BY (region)") + spark.sql( + "INSERT INTO cat.db.native_overwrite_dyn VALUES " + + "(1, 'us-east', 1.0), (2, 'us-west', 2.0), (3, 'eu', 3.0)") + assertNativeWriteEngages("native_overwrite_dyn", Seq(2, 3, 10)) { + withSQLConf("spark.sql.sources.partitionOverwriteMode" -> "DYNAMIC") { + spark.sql("INSERT OVERWRITE cat.db.native_overwrite_dyn VALUES (10, 'us-east', 100.0)") + } + } + } + } + + test("native acceleration: ReplaceData (CoW DELETE)") { + assumeNativeAcceleration() + withIcebergCatalog { warehouseDir => + createTable( + warehouseDir, + "native_cow_delete", + partitionSpec = "", + properties = Some("'write.delete.mode'='copy-on-write'")) + // Seed via the JVM path so the assertion isolates native engagement to the DELETE. + withSQLConf(CometConf.COMET_ICEBERG_WRITE_SPLIT_OPERATOR_ENABLED.key -> "false") { + coalesceInsert( + "native_cow_delete", + Seq((1, "us-east", 10.0), (2, "us-west", 20.0), (3, "eu", 30.0), (4, "us-east", 40.0))) + } + assertNativeWriteEngages("native_cow_delete", Seq(1, 3, 4)) { + spark.sql("DELETE FROM cat.db.native_cow_delete WHERE id = 2") + } + } + } + + test("native acceleration: ReplaceData (CoW UPDATE)") { + assumeNativeAcceleration() + withIcebergCatalog { warehouseDir => + createTable( + warehouseDir, + "native_cow_update", + partitionSpec = "", + properties = Some("'write.update.mode'='copy-on-write'")) + withSQLConf(CometConf.COMET_ICEBERG_WRITE_SPLIT_OPERATOR_ENABLED.key -> "false") { + coalesceInsert( + "native_cow_update", + Seq((1, "us-east", 10.0), (2, "us-west", 20.0), (3, "eu", 30.0))) + } + // Engage natively; expected rows are the ids 1..3 (UPDATE keeps cardinality). + assertNativeWriteEngages("native_cow_update", Seq(1, 2, 3)) { + spark.sql("UPDATE cat.db.native_cow_update SET amount = amount * 2 WHERE id = 2") + } + // Spot-check the UPDATE actually rewrote row 2 (cardinality unchanged + value flipped). + val r = + spark.sql("SELECT id, amount FROM cat.db.native_cow_update WHERE id = 2").collect() + assert(r.length == 1 && r(0).getDouble(1) == 40.0, s"got ${r.toSeq}") + } + } + + test("native acceleration: ReplaceData (CoW MERGE) falls back (MergeRowsExec not Comet)") { + // TODO(comet-merge-rows): native MERGE engagement requires a Comet equivalent of Iceberg's + // `MergeRowsExec` (the per-row dispatch operator that assigns __row_operation codes from + // MATCHED/NOT MATCHED clauses). Without it, `MergeRowsExec` stays JVM, the upstream chain + // breaks Comet-native partway, and `requiresNativeChildren=true` declines the + // `IcebergWriteExec -> CometIcebergWriteExec` conversion. Until that lands, MERGE + // falls back to the JVM two-op path -- this test pins that contract so a future MERGE-row-exec + // addition surfaces clearly (the test will start failing and need to flip back to + // `assertNativeWriteEngages`). + assumeNativeAcceleration() + withIcebergCatalog { warehouseDir => + createTable( + warehouseDir, + "native_cow_merge", + partitionSpec = "", + properties = Some("'write.merge.mode'='copy-on-write'")) + withSQLConf(CometConf.COMET_ICEBERG_WRITE_SPLIT_OPERATOR_ENABLED.key -> "false") { + coalesceInsert("native_cow_merge", Seq((1, "us-east", 10.0), (2, "us-west", 20.0))) + } + assertNativeWriteDoesNotEngage("native_cow_merge", Seq(1, 2, 3)) { + spark.sql(""" + |MERGE INTO cat.db.native_cow_merge t + |USING (SELECT 2 AS id, 'us-west' AS region, 200.0 AS amount UNION ALL + | SELECT 3 AS id, 'eu' AS region, 30.0 AS amount) s + |ON t.id = s.id + |WHEN MATCHED THEN UPDATE SET t.amount = s.amount + |WHEN NOT MATCHED THEN INSERT (id, region, amount) VALUES (s.id, s.region, s.amount) + |""".stripMargin) + } + } + } + + test("native acceleration: complex types (struct, array, map) round-trip with field IDs") { + assumeNativeAcceleration() + withIcebergCatalog { _ => + // Three nested kinds in one schema so the recursive PARQUET_FIELD_ID_META_KEY decoration + // is exercised end-to-end. Reading back via Iceberg's reader is the proof point: Iceberg + // matches columns by field id, not by name, so a row that round-trips means every nested + // field id made it into the Parquet metadata. + spark.sql(s""" + CREATE TABLE $catalog.$ns.native_complex ( + id INT, + addr STRUCT, + tags ARRAY, + attrs MAP + ) USING iceberg + """) + val snapshot = withNativeEnabled { + captureWrite("native_complex") { + spark.sql(""" + INSERT INTO cat.db.native_complex VALUES + (1, named_struct('city', 'NYC', 'zip', 10001), array('a', 'b'), map('k1', 1, 'k2', 2)), + (2, named_struct('city', 'SF', 'zip', 94016), array('c'), map('k3', 3)) + """) + } + } + assert(snapshot.snapshotDelta == 1L) + val nativeExecs = snapshot.plans.flatMap { p => + collectWithSubqueries(p) { case e: CometIcebergWriteExec => e } + } + assert(nativeExecs.nonEmpty, "expected native write exec in captured plans") + val rows = spark + .sql(s"SELECT id, addr.city, addr.zip, tags, attrs FROM $catalog.$ns.native_complex" + + " ORDER BY id") + .collect() + assert(rows.length == 2) + assert(rows(0).getInt(0) == 1) + assert(rows(0).getString(1) == "NYC") + assert(rows(0).getInt(2) == 10001) + assert(rows(1).getString(1) == "SF") + assert(rows(1).getInt(2) == 94016) + } + } + + test("Comet-written rows round-trip through Spark's reader unchanged") { + assume(icebergAvailable, "Iceberg not available in classpath") + withIcebergCatalog { warehouseDir => + createTable(warehouseDir, "parity_comet", partitionSpec = "PARTITIONED BY (region)") + createTable(warehouseDir, "parity_spark", partitionSpec = "PARTITIONED BY (region)") + + spark.sql( + "INSERT INTO cat.db.parity_comet VALUES " + + "(1, 'us', 1.5), (2, 'eu', 2.5), (3, 'ap', 3.5), (4, 'us', 4.5)") + + withSQLConf(CometConf.COMET_ICEBERG_WRITE_SPLIT_OPERATOR_ENABLED.key -> "false") { + spark.sql( + "INSERT INTO cat.db.parity_spark VALUES " + + "(1, 'us', 1.5), (2, 'eu', 2.5), (3, 'ap', 3.5), (4, 'us', 4.5)") + } + + val cometRows: Array[Row] = spark + .sql("SELECT id, region, amount FROM cat.db.parity_comet ORDER BY id") + .collect() + val sparkRows: Array[Row] = spark + .sql("SELECT id, region, amount FROM cat.db.parity_spark ORDER BY id") + .collect() + assert(cometRows.toSeq == sparkRows.toSeq, s"$cometRows vs $sparkRows") + } + } + + // --- Helpers ----------------------------------------------------------------------------------- + + private val catalog = "cat" + private val ns = "db" + + /** + * Spin up an isolated Hadoop catalog under a temp dir and run `f`. Tests share the catalog name + * within a single `withIcebergCatalog` block but get a fresh warehouse directory per outer + * call. + */ + private def withIcebergCatalog(f: File => Unit): Unit = withTempIcebergDir { warehouseDir => + withSQLConf( + s"spark.sql.catalog.$catalog" -> "org.apache.iceberg.spark.SparkCatalog", + s"spark.sql.catalog.$catalog.type" -> "hadoop", + s"spark.sql.catalog.$catalog.warehouse" -> warehouseDir.getAbsolutePath, + CometConf.COMET_ENABLED.key -> "true", + CometConf.COMET_EXEC_ENABLED.key -> "true") { + f(warehouseDir) + } + } + + private def createTable( + warehouseDir: File, + tableName: String, + partitionSpec: String, + properties: Option[String] = None): Unit = { + val props = properties.map(s => s" TBLPROPERTIES ($s)").getOrElse("") + spark.sql(s""" + CREATE TABLE $catalog.$ns.$tableName ( + id INT, + region STRING, + amount DOUBLE + ) USING iceberg + $partitionSpec + $props + """) + } + + /** + * INSERT a small in-memory dataset into `tableName` as a single data file. `INSERT INTO ... + * VALUES (...), (...)` can be split per VALUES row into multiple files; `coalesce(1)` keeps the + * setup consistent so DML predicates target a single file, exercising the rewrite path + * deterministically. Uses the COMET write path itself (no fallback) so the setup also serves as + * additional AppendData coverage. + */ + private def coalesceInsert(tableName: String, rows: Seq[(Int, String, Double)]): Unit = { + // `spark` is a `def` on the base class, so `spark.implicits._` is not a stable path. Capture + // a local handle first so the implicit `Encoder`s come from a stable identifier. + val session = spark + import session.implicits._ + rows + .toDF("id", "region", "amount") + .coalesce(1) + .writeTo(s"$catalog.$ns.$tableName") + .append() + } + + /** + * Count the snapshot history of `tableName` before and after `action`, capturing executed plans + * in between. The delta = number of snapshots committed during `action`; a successful + * single-write should produce exactly one new snapshot. Driven from Iceberg's + * `.snapshots` metadata table (no production-side counter needed) -- the snapshot + * history is the canonical record of commits, and using it here keeps tests honest against what + * Iceberg actually persisted. + * + * AQE finalisation can fire multiple `QueryExecutionListener.onSuccess` events for a single + * command, so we additionally accumulate every captured `executedPlan` -- callers check + * `IcebergCommitExec` / `IcebergWriteExec` presence in the union of all captures. + */ + private def captureWrite(tableName: String)(action: => Unit): WriteSnapshot = { + val before = countSnapshots(tableName) + val captured = mutable.Buffer.empty[SparkPlan] + val listener = new QueryExecutionListener { + override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = { + captured += qe.executedPlan + } + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = + () + } + spark.listenerManager.register(listener) + try { + action + // Drain the SparkContext listener bus so any post-action SparkListener events have a + // chance to land before we read `captured`. We don't *require* the bus to be empty (the + // QueryExecutionListener fires synchronously enough for our purposes), so a timeout here + // -- which Iceberg 1.5.2's `
.snapshots` query path can trigger on Spark 3.4 by + // queueing more SparkListener traffic than `waitUntilEmpty`'s default 10s budget -- is + // not fatal to the test. + try CometListenerBusUtils.waitUntilEmpty(spark.sparkContext) + catch { case _: java.util.concurrent.TimeoutException => () } + } finally { + spark.listenerManager.unregister(listener) + } + val after = countSnapshots(tableName) + WriteSnapshot(after - before, captured.toSeq) + } + + /** + * Read the current snapshot count from Iceberg's `
.snapshots` metadata table. Returns + * `0` if the table has no snapshots yet (e.g. an empty newly-created table) and short-circuits + * to `0` if the snapshot read throws (the catalog might not yet have indexed a brand-new table, + * in which case "no snapshots" is the right interpretation). + */ + private def countSnapshots(tableName: String): Long = + try { + spark + .sql(s"SELECT count(*) FROM $catalog.$ns.$tableName.snapshots") + .collect() + .head + .getLong(0) + } catch { + case _: Throwable => 0L + } + + private def collectIcebergWriteOps( + plans: Seq[SparkPlan]): (Seq[IcebergCommitExec], Seq[IcebergWriteExec]) = { + val commits = plans.flatMap { plan => + collectWithSubqueries(plan) { case c: IcebergCommitExec => c } + } + val writes = plans.flatMap { plan => + collectWithSubqueries(plan) { case w: IcebergWriteExec => w } + } + (commits, writes) + } + + /** + * Strongest invariant we can pin on a single Comet-Iceberg write: the Iceberg table's snapshot + * history advanced by exactly one entry, and the captured plans contain at least one + * `IcebergCommitExec` plus at least one `IcebergWriteExec`. + * + * The snapshot-delta check guards against AQE re-planning, logical-link drift, and any future + * planner-level bug that could re-emit `IcebergCommitExec` mid-execution -- each extra commit + * produces a new snapshot, so duplicates surface as a delta > 1. The plan-presence check guards + * against regressions that silently fall back to Spark's own V2 path. + * + * Plan counts >=1 because AQE finalisation can fire multiple `onSuccess` events for a single + * write, each carrying the same operator instances; exact event count isn't load-bearing. + */ + private def assertExactlyOneCommit(snapshot: WriteSnapshot): Unit = { + assert( + snapshot.snapshotDelta == 1L, + s"expected exactly 1 new Iceberg snapshot, got ${snapshot.snapshotDelta}. Plans:\n" + + snapshot.plans.mkString("\n--\n")) + val (commits, writes) = collectIcebergWriteOps(snapshot.plans) + assert( + commits.nonEmpty, + s"expected >= 1 IcebergCommitExec in captured plans, got ${commits.size}. Plans:\n" + + snapshot.plans.mkString("\n--\n")) + assert( + writes.nonEmpty, + s"expected >= 1 IcebergWriteExec in captured plans, got ${writes.size}. Plans:\n" + + snapshot.plans.mkString("\n--\n")) + } + + private def assertRows(tableName: String, expectedIds: Seq[Int]): Unit = { + val ids = spark + .sql(s"SELECT id FROM $catalog.$ns.$tableName ORDER BY id") + .collect() + .map(_.getInt(0)) + .toSeq + assert(ids == expectedIds, s"expected $expectedIds, got $ids") + } + + /** Native acceleration shared assumption -- currently just the Iceberg-on-classpath check. */ + private def assumeNativeAcceleration(): Unit = { + assume(icebergAvailable, "Iceberg not available in classpath") + } + + /** + * Flip [[CometConf.COMET_ICEBERG_NATIVE_WRITE_ENABLED]] for the duration of `action`. + * + * We also enable [[CometConf.COMET_EXEC_LOCAL_TABLE_SCAN_ENABLED]] (default off) so VALUES- + * driven INSERTs have a Comet-native upstream -- `requiresNativeChildren = true` would + * otherwise short-circuit the conversion to `CometIcebergWriteExec` because Spark emits a bare + * `LocalTableScanExec` for inline `VALUES`. Using `sessionState.conf.setConfString` directly + * (rather than `withSQLConf`) keeps the override visible to the columnar rule across some Spark + * version / session-state combinations where `withSQLConf` loses the override before the rule + * fires. + */ + private def withNativeEnabled[T](action: => T): T = { + val session = spark + session.sessionState.conf + .setConfString(CometConf.COMET_ICEBERG_NATIVE_WRITE_ENABLED.key, "true") + session.sessionState.conf + .setConfString(CometConf.COMET_EXEC_LOCAL_TABLE_SCAN_ENABLED.key, "true") + try action + finally { + session.sessionState.conf.unsetConf(CometConf.COMET_EXEC_LOCAL_TABLE_SCAN_ENABLED.key) + session.sessionState.conf.unsetConf(CometConf.COMET_ICEBERG_NATIVE_WRITE_ENABLED.key) + } + } + + /** + * Strongest invariant we can pin on a single native write: commit-count advanced by exactly one + * (same as the JVM-path assertion -- AQE re-planning never duplicates commits) AND at least one + * [[CometIcebergWriteExec]] appears in some captured plan AND the resulting row set matches. + */ + private def assertNativeWriteEngages(tableName: String, expectedIds: Seq[Int])( + action: => Unit): Unit = { + val snapshot = withNativeEnabled { captureWrite(tableName)(action) } + assert( + snapshot.snapshotDelta == 1L, + s"expected 1 commit via native path, got ${snapshot.snapshotDelta}. Plans:\n" + + snapshot.plans.mkString("\n--\n")) + val nativeExecs = snapshot.plans.flatMap { p => + collectWithSubqueries(p) { case e: CometIcebergWriteExec => e } + } + assert( + nativeExecs.nonEmpty, + "expected >= 1 CometIcebergWriteExec in captured plans, got 0. Plans:\n" + + snapshot.plans.mkString("\n--\n")) + assertRows(tableName, expectedIds) + } + + /** + * For writes that *should* fall back even when the native conf is on (e.g. CoW MERGE): exactly + * one commit, no `CometIcebergWriteExec`, but the JVM two-op pair still present and rows + * correct. Catches regressions where a trigger silently stops firing and we accidentally run + * native for an unsupported case. + */ + private def assertNativeWriteDoesNotEngage(tableName: String, expectedIds: Seq[Int])( + action: => Unit): Unit = { + val snapshot = withNativeEnabled { captureWrite(tableName)(action) } + assertExactlyOneCommit(snapshot) + val nativeExecs = snapshot.plans.flatMap { p => + collectWithSubqueries(p) { case e: CometIcebergWriteExec => e } + } + assert( + nativeExecs.isEmpty, + s"expected NO CometIcebergWriteExec, got ${nativeExecs.size}. Plans:\n" + + snapshot.plans.mkString("\n--\n")) + assertRows(tableName, expectedIds) + } +} diff --git a/spark/src/test/scala/org/apache/comet/CometIcebergWriteDetectionSuite.scala b/spark/src/test/scala/org/apache/comet/CometIcebergWriteDetectionSuite.scala new file mode 100644 index 0000000000..57679c7f6c --- /dev/null +++ b/spark/src/test/scala/org/apache/comet/CometIcebergWriteDetectionSuite.scala @@ -0,0 +1,621 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet + +import java.io.File + +import org.apache.spark.SparkConf +import org.apache.spark.sql.CometTestBase +import org.apache.spark.sql.comet.IcebergWriteExec + +import org.apache.comet.CometSparkSessionExtensions.isSpark35Plus +import org.apache.comet.iceberg.IcebergReflection +import org.apache.comet.serde.{Compatible, SupportLevel, Unsupported} +import org.apache.comet.serde.operator.CometIcebergNativeWrite + +/** + * Pins every documented fall-back trigger in [[CometIcebergNativeWrite.getSupportLevel]] without + * paying the cost of an actual write per case. Tests build an [[IcebergWriteExec]] via Spark's + * SQL planning (we only inspect `queryExecution.sparkPlan`; the command itself never fires) and + * call `getSupportLevel` directly. + * + * Companion to [[CometIcebergWriteActionSuite]]: the action suite checks end-to-end behaviour + * with a real catalog and physical writes; this suite checks the detection gate in isolation. + */ +class CometIcebergWriteDetectionSuite extends CometTestBase with CometIcebergTestBase { + + override protected def sparkConf: SparkConf = { + super.sparkConf + .set(CometConf.COMET_ICEBERG_WRITE_SPLIT_OPERATOR_ENABLED.key, "true") + .set(CometConf.COMET_ICEBERG_NATIVE_WRITE_ENABLED.key, "true") + } + + // --- Positive baseline ----------------------------------------------------------------------- + + test("clean parquet V2 table planned as AppendData yields Compatible") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { dir => + createTable(dir, "ok", partitionSpec = "") + assertSupportLevelIs[Compatible]("ok") + } + } + + // --- SparkWrite reflection coverage ---------------------------------------------------------- + + // Pins that every private-field / private-method accessor the native-write gate and proto + // builder depend on still resolves against a real `SparkWrite` from the Iceberg runtime on the + // build's classpath. These accessors fail closed (Option -> JVM fall-back) by design, so a field + // rename (e.g. `useFanoutWriter` <-> `partitionedFanoutEnabled` between Iceberg 1.5.2 and 1.8.1) + // would silently lose native acceleration with no other test failing. This is the explicit + // compatibility matrix the comments in IcebergReflection only describe. + test("SparkWrite reflection helpers all resolve on the current Iceberg runtime") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { dir => + createTable(dir, "refl_probe", partitionSpec = "") + val sparkWrite = IcebergReflection + .getOuterSparkWrite(insertWriteExec("refl_probe").batchWrite) + .getOrElse(fail("could not unwrap outer SparkWrite from BatchWrite")) + val table = IcebergReflection + .getTableFromSparkWrite(sparkWrite) + .getOrElse(fail("SparkWrite.table reflection returned None")) + + assert(IcebergReflection.getOperationIdFromSparkWrite(sparkWrite).isDefined, "queryId") + assert( + IcebergReflection.getTargetFileSizeFromSparkWrite(sparkWrite).isDefined, + "targetFileSize") + assert( + IcebergReflection.getUseFanoutWriterFromSparkWrite(sparkWrite).isDefined, + "useFanoutWriter") + assert( + IcebergReflection.getOutputSpecIdFromSparkWrite(sparkWrite).isDefined, + "outputSpecId") + assert(IcebergReflection.getWriteSchemaFromSparkWrite(sparkWrite).isDefined, "writeSchema") + assert(IcebergReflection.getFormatFromSparkWrite(sparkWrite).isDefined, "format") + assert( + IcebergReflection.getWritePropertiesFromSparkWrite(sparkWrite).isDefined, + "writeProperties") + // `SparkWrite.writeConf.outputSortOrderId(...)` is intentionally NOT pinned here: it does not + // exist on any Iceberg version Comet targets (1.5.2 / 1.8.1 / 1.10.0), and the proto builder + // deliberately defaults `sort_order_id` to 0 to match Iceberg-Java (whose Spark WriterFactory + // never stamps the table sort order on appended files). Sort-order parity is covered by the + // action suite, not by reflection resolvability. + + assert(IcebergReflection.getMetadataLocation(table).isDefined, "metadataLocation") + assert(IcebergReflection.getDataLocation(table).isDefined, "dataLocation") + } + } + + // --- write.format.default -------------------------------------------------------------------- + + test("fall-back: write.format.default=orc") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { dir => + createTable( + dir, + "fmt_orc", + partitionSpec = "", + properties = Some("'write.format.default'='orc'")) + assertUnsupportedContains("fmt_orc", "format=orc", "only parquet") + } + } + + // Mirrors the table-property gate above but exercises the option-override branch of + // `SparkWriteConf.dataFileFormat()`: a per-write `write-format` option must win over a + // parquet-default table. The DataFrame-writer driver routes the option into `SparkWriteConf` + // the same way Iceberg-Java does at runtime, so the gate reads it from `SparkWrite.format`. + test("fall-back: per-write write-format option overrides parquet default") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { dir => + createTable(dir, "fmt_orc_opt", partitionSpec = "") + val writeExec = dfWriteExec("fmt_orc_opt", "write-format" -> "orc") + val support = CometIcebergNativeWrite.getSupportLevel(writeExec) + support match { + case Unsupported(Some(reason)) => + assert(reason.contains("format=orc"), s"reason '$reason' missing 'format=orc'") + assert(reason.contains("only parquet"), s"reason '$reason' missing 'only parquet'") + case other => fail(s"expected Unsupported, got $other") + } + } + } + + // --- write.object-storage.enabled ------------------------------------------------------------ + + test("fall-back: write.object-storage.enabled=true") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { dir => + createTable( + dir, + "obj_store", + partitionSpec = "", + properties = Some("'write.object-storage.enabled'='true'")) + assertUnsupportedContains("obj_store", "write.object-storage.enabled") + } + } + + // --- write.location-provider.impl ------------------------------------------------------------ + + test("fall-back: write.location-provider.impl set") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { dir => + // The trigger only checks whether the property exists, not its value -- any string trips + // it. We use a class name that isn't actually loadable: the captured `executedPlan` still + // contains the `IcebergWriteExec` we want to inspect; the writer-side load failure + // is caught and ignored by our `QueryExecutionListener`-based capture. + createTable( + dir, + "loc_provider", + partitionSpec = "", + properties = Some("'write.location-provider.impl'='com.example.MyProvider'")) + assertUnsupportedContains("loc_provider", "write.location-provider.impl") + } + } + + // --- format-version >= 3 --------------------------------------------------------------------- + + test("fall-back: format-version=3") { + assume(icebergAvailable, "Iceberg not available in classpath") + // Iceberg 1.5.2 (Spark 3.4 profile) cannot create V3 tables -- the writer rejects with + // "Cannot upgrade table to unsupported format version: 3". Gate on 3.5+ which ships with + // Iceberg >= 1.8 (V3 supported on the writer side). + assume(isSpark35Plus, "V3 tables require Iceberg 1.8.1+ (Spark 3.5 profile)") + withDetectionCatalog { dir => + createTable(dir, "v3", partitionSpec = "", properties = Some("'format-version'='3'")) + assertUnsupportedContains("v3", "format-version=3") + } + } + + // --- encryption.* ---------------------------------------------------------------------------- + + test("fall-back: encryption.kms-client-impl set") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { dir => + createTable( + dir, + "enc", + partitionSpec = "", + properties = Some("'encryption.kms-client-impl'='com.example.MyKms'")) + assertUnsupportedContains("enc", "encryption") + } + } + + // --- write.metadata.metrics.default contains 'counts' ---------------------------------------- + + test("fall-back: write.metadata.metrics.default=counts") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { dir => + createTable( + dir, + "metrics_counts", + partitionSpec = "", + properties = Some("'write.metadata.metrics.default'='counts'")) + assertUnsupportedContains("metrics_counts", "write.metadata.metrics.default", "counts") + } + } + + // --- write.metadata.metrics.column.=counts ------------------------------------------------ + + test("fall-back: per-column metrics mode=counts") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { dir => + createTable( + dir, + "metrics_col_counts", + partitionSpec = "", + properties = Some("'write.metadata.metrics.column.id'='counts'")) + assertUnsupportedContains( + "metrics_col_counts", + "write.metadata.metrics.column.id", + "counts") + } + } + + // --- write.parquet.bloom-filter-max-bytes ---------------------------------------------------- + + test("fall-back: write.parquet.bloom-filter-max-bytes set") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { dir => + createTable( + dir, + "bloom_max", + partitionSpec = "", + properties = Some("'write.parquet.bloom-filter-max-bytes'='524288'")) + assertUnsupportedContains("bloom_max", "write.parquet.bloom-filter-max-bytes") + } + } + + // --- write.parquet.bloom-filter-enabled.column.=true -------------------------------------- + + test("fall-back: per-column bloom filter enabled") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { dir => + createTable( + dir, + "bloom_col", + partitionSpec = "", + properties = Some("'write.parquet.bloom-filter-enabled.column.id'='true'")) + assertUnsupportedContains( + "bloom_col", + "write.parquet.bloom-filter-enabled.column.id", + "true") + } + } + + // --- schema field count > max-inferred-column-defaults --------------------------------------- + + test("fall-back: schema exceeds write.metadata.metrics.max-inferred-column-defaults") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { _ => + // Lower the cap to 2 so our 3-column fixture table trips it without making the schema huge. + val sql = s""" + CREATE TABLE $catalog.$ns.too_many_cols ( + id INT, + region STRING, + amount DOUBLE + ) USING iceberg + TBLPROPERTIES ('write.metadata.metrics.max-inferred-column-defaults'='2') + """ + spark.sql(sql) + assertUnsupportedContains( + "too_many_cols", + "projected field IDs which exceeds", + "write.metadata.metrics.max-inferred-column-defaults=2") + } + } + + // Iceberg only applies the inferred-column none-truncation when NO explicit + // `write.metadata.metrics.default` is set; with a default present every column uses that mode + // regardless of count, so the gate must not fire even when the schema exceeds the cap. + test("Compatible when over max-inferred cap but an explicit metrics.default is set") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { _ => + val sql = s""" + CREATE TABLE $catalog.$ns.cap_with_default ( + id INT, + region STRING, + amount DOUBLE + ) USING iceberg + TBLPROPERTIES ( + 'write.metadata.metrics.max-inferred-column-defaults'='2', + 'write.metadata.metrics.default'='full' + ) + """ + spark.sql(sql) + assertSupportLevelIs[Compatible]("cap_with_default") + } + } + + // --- write.parquet.row-group-check-min-record-count != default ------------------------------- + + test("fall-back: row-group-check-min-record-count non-default") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { dir => + createTable( + dir, + "rg_min", + partitionSpec = "", + properties = Some("'write.parquet.row-group-check-min-record-count'='500'")) + assertUnsupportedContains("rg_min", "write.parquet.row-group-check-min-record-count=500") + } + } + + test("Compatible when row-group-check-min-record-count is at default (100)") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { dir => + // Explicitly set to the Iceberg default -- the gate must not trigger on equal-to-default + // values, only on divergent ones. + createTable( + dir, + "rg_min_default", + partitionSpec = "", + properties = Some("'write.parquet.row-group-check-min-record-count'='100'")) + assertSupportLevelIs[Compatible]("rg_min_default") + } + } + + // --- write.parquet.row-group-check-max-record-count != default ------------------------------- + + test("fall-back: row-group-check-max-record-count non-default") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { dir => + createTable( + dir, + "rg_max", + partitionSpec = "", + properties = Some("'write.parquet.row-group-check-max-record-count'='50000'")) + assertUnsupportedContains("rg_max", "write.parquet.row-group-check-max-record-count=50000") + } + } + + // --- write.metadata.metrics.default=none ----------------------------------------------------- + + test("fall-back: write.metadata.metrics.default=none") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { dir => + createTable( + dir, + "metrics_none", + partitionSpec = "", + properties = Some("'write.metadata.metrics.default'='none'")) + assertUnsupportedContains("metrics_none", "write.metadata.metrics.default", "none") + } + } + + // --- per-column metrics mode=none ------------------------------------------------------------ + + test("fall-back: per-column metrics mode=none") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { dir => + createTable( + dir, + "col_metrics_none", + partitionSpec = "", + properties = Some("'write.metadata.metrics.column.region'='none'")) + assertUnsupportedContains( + "col_metrics_none", + "write.metadata.metrics.column.region", + "none") + } + } + + // (No `write.parquet.page-version` gate: that property does not exist in any Iceberg version + // Comet targets -- the parquet writer is hardwired to PARQUET_1_0 -- so setting it is a no-op in + // Iceberg-Java just as in the native path. Nothing to gate.) + + // --- parquet.enable.dictionary set ----------------------------------------------------------- + + test("fall-back: parquet.enable.dictionary set") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { dir => + createTable( + dir, + "enable_dict", + partitionSpec = "", + properties = Some("'parquet.enable.dictionary'='false'")) + assertUnsupportedContains("enable_dict", "parquet.enable.dictionary") + } + } + + // --- per-column write.parquet.stats-enabled.column. --------------------------------------- + + // `write.parquet.stats-enabled.column.*` only exists (and is honoured) on Iceberg 1.10.0+ + // (`TableProperties.PARQUET_COLUMN_STATS_ENABLED_PREFIX`). On 1.5.2 / 1.8.1 Iceberg-Java ignores + // it, so the native write matches Java and must NOT fall back. Branch the expectation on whether + // the constant exists, so the test asserts the right thing on every profile. + test("per-column write.parquet.stats-enabled.: gated only where Iceberg honours it") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { dir => + createTable( + dir, + "col_stats", + partitionSpec = "", + properties = Some("'write.parquet.stats-enabled.column.region'='false'")) + val honoured = + IcebergReflection + .tablePropertyConstantOpt("PARQUET_COLUMN_STATS_ENABLED_PREFIX") + .isDefined + if (honoured) { + assertUnsupportedContains("col_stats", "write.parquet.stats-enabled.column.region") + } else { + assertSupportLevelIs[Compatible]("col_stats") + } + } + } + + // --- io-impl set ----------------------------------------------------------------------------- + + test("fall-back: io-impl set") { + assume(icebergAvailable, "Iceberg not available in classpath") + withDetectionCatalog { dir => + createTable( + dir, + "io_impl", + partitionSpec = "", + properties = Some("'io-impl'='com.example.MyFileIO'")) + assertUnsupportedContains("io_impl", "io-impl") + } + } + + // --- unsupported data location URI scheme ---------------------------------------------------- + + test("fall-back: data location URI scheme not supported by the native writer") { + assume(icebergAvailable, "Iceberg not available in classpath") + // Override the table's data location via `write.data.path` so the location provider returns + // a URI whose scheme `iceberg_common::storage_factory_for` does not resolve. The actual + // path is never opened -- detection runs at planning time before any write attempt. + withDetectionCatalog { dir => + createTable( + dir, + "bad_scheme", + partitionSpec = "", + properties = Some("'write.data.path'='hdfs://nonexistent.invalid/iceberg/db/bad_scheme'")) + assertUnsupportedContains("bad_scheme", "scheme 'hdfs'", "not supported") + } + } + + // --- Helpers ----------------------------------------------------------------------------------- + + private val catalog = "cat" + private val ns = "db" + + /** + * Lighter analogue of `CometIcebergWriteActionSuite.withIcebergCatalog`: same catalog wiring, + * no Comet read/exec acceleration (we're only inspecting planning, not executing). + */ + private def withDetectionCatalog(f: File => Unit): Unit = withTempIcebergDir { warehouseDir => + withSQLConf( + s"spark.sql.catalog.$catalog" -> "org.apache.iceberg.spark.SparkCatalog", + s"spark.sql.catalog.$catalog.type" -> "hadoop", + s"spark.sql.catalog.$catalog.warehouse" -> warehouseDir.getAbsolutePath) { + f(warehouseDir) + } + } + + private def createTable( + warehouseDir: File, + tableName: String, + partitionSpec: String, + properties: Option[String] = None): Unit = { + val props = properties.map(s => s" TBLPROPERTIES ($s)").getOrElse("") + spark.sql(s""" + CREATE TABLE $catalog.$ns.$tableName ( + id INT, + region STRING, + amount DOUBLE + ) USING iceberg + $partitionSpec + $props + """) + } + + /** + * Trigger an `INSERT INTO` and pluck the `IcebergWriteExec` out of the captured `executedPlan`. + */ + private def insertWriteExec(tableName: String): IcebergWriteExec = + captureWriteExec(tableName) { + spark.sql(s"INSERT INTO $catalog.$ns.$tableName VALUES (1, 'us', 1.0)") + } + + /** + * Sibling of [[insertWriteExec]] that uses the DataFrameWriterV2 API so per-write `option(...)` + * pairs flow into `SparkWriteConf`. Pins gates whose effective value resolves through a + * `SparkWrite` field (e.g. the resolved `format` overlaying `write.format.default` with the + * per-write `write-format` option). + */ + private def dfWriteExec(tableName: String, options: (String, String)*): IcebergWriteExec = + captureWriteExec(tableName) { + val df = spark + .createDataFrame(Seq((1, "us", 1.0))) + .toDF("id", "region", "amount") + val writer = options.foldLeft(df.writeTo(s"$catalog.$ns.$tableName")) { case (w, (k, v)) => + w.option(k, v) + } + writer.append() + } + + /** + * Spark 3.5+ `QueryExecution.sparkPlan` accesses `commandExecuted`, which eagerly executes V2 + * commands. So we can't inspect a planned-but-unexecuted tree -- the write fires whether we + * want it to or not. A `QueryExecutionListener` lets us capture the executedPlan whether the + * write succeeds or throws (some negative fixtures, e.g. encryption, deliberately set values + * that crash the JVM writer at task time). + */ + private def captureWriteExec(tableName: String)(trigger: => Unit): IcebergWriteExec = { + val captured = + new java.util.concurrent.atomic.AtomicReference[org.apache.spark.sql.execution.SparkPlan]() + val listener = new org.apache.spark.sql.util.QueryExecutionListener { + override def onSuccess( + funcName: String, + qe: org.apache.spark.sql.execution.QueryExecution, + durationNs: Long): Unit = + captured.compareAndSet(null, qe.executedPlan) + override def onFailure( + funcName: String, + qe: org.apache.spark.sql.execution.QueryExecution, + exception: Exception): Unit = + captured.compareAndSet(null, qe.executedPlan) + } + // Drain any pending `QueryExecutionListener` events from the prior `createTable` SQL -- + // Spark's `ExecutionListenerBus` delivers events asynchronously, so a CREATE TABLE event + // queued before we register can still arrive after, and our `compareAndSet(null, ...)` + // would capture the wrong plan. + try org.apache.spark.CometListenerBusUtils.waitUntilEmpty(spark.sparkContext) + catch { case _: java.util.concurrent.TimeoutException => () } + spark.listenerManager.register(listener) + try { + try trigger + catch { case _: Throwable => () } + try org.apache.spark.CometListenerBusUtils.waitUntilEmpty(spark.sparkContext) + catch { case _: java.util.concurrent.TimeoutException => () } + } finally { + spark.listenerManager.unregister(listener) + } + val plan = Option(captured.get()) + .getOrElse(fail(s"No QueryExecution captured for $tableName")) + findWriteExecOrFail(plan) + } + + private def findWriteExecOrFail( + plan: org.apache.spark.sql.execution.SparkPlan): IcebergWriteExec = + findWriteExec(plan).getOrElse(fail(s"no IcebergWriteExec found in:\n$plan")) + + private def findWriteExec( + plan: org.apache.spark.sql.execution.SparkPlan): Option[IcebergWriteExec] = + plan match { + case e: IcebergWriteExec => Some(e) + case other => + val descend = other.children.iterator ++ wrappedChildren(other).iterator + descend.flatMap(findWriteExec).toSeq.headOption + } + + /** + * Some Spark execs hide their physical tree behind accessors that aren't reported in + * `children`: + * - `CommandResultExec` stores the command tree on `commandPhysicalPlan` (the result is + * already materialised, so `children` returns Nil). + * - `AdaptiveSparkPlanExec` stores its inner plan on `executedPlan` (the wrapped plan re- + * plans on stage materialisation, so `children` is empty by design). Both accessors are + * stable across Spark 3.4-4.0; we pluck them reflectively so the test stays + * version-independent. + */ + private def wrappedChildren(plan: org.apache.spark.sql.execution.SparkPlan) + : Iterable[org.apache.spark.sql.execution.SparkPlan] = { + def viaAccessor(method: String): Option[org.apache.spark.sql.execution.SparkPlan] = + scala.util + .Try { + plan.getClass + .getMethod(method) + .invoke(plan) + .asInstanceOf[org.apache.spark.sql.execution.SparkPlan] + } + .toOption + .filter(_ ne plan) + // `plan` covers Spark 4.0's `ResultQueryStage` (and other `QueryStageExec` subclasses); the + // wrapped plan is held on a field that doesn't make it into `children`. + Seq("commandPhysicalPlan", "executedPlan", "plan").flatMap(viaAccessor) + } + + private def assertSupportLevelIs[T <: SupportLevel: scala.reflect.ClassTag]( + tableName: String): Unit = { + val support = CometIcebergNativeWrite.getSupportLevel(insertWriteExec(tableName)) + val expected = scala.reflect.classTag[T].runtimeClass + assert( + expected.isInstance(support), + s"expected ${expected.getSimpleName} for $tableName, got $support") + } + + /** + * Pin both that the gate returns `Unsupported` and that the reason string contains every + * fragment in `fragments`. Substring matching keeps tests resilient to small phrasing changes + * while still catching wrong-trigger drift (a different gate firing would surface a different + * fragment set). + */ + private def assertUnsupportedContains(tableName: String, fragments: String*): Unit = { + val support = CometIcebergNativeWrite.getSupportLevel(insertWriteExec(tableName)) + support match { + case Unsupported(Some(reason)) => + fragments.foreach(f => + assert(reason.contains(f), s"reason '$reason' missing fragment '$f'")) + case Unsupported(None) => + fail("Unsupported without a reason string") + case other => + fail(s"expected Unsupported, got $other") + } + } +} diff --git a/spark/src/test/scala/org/apache/comet/serde/operator/IcebergWriteProtoTranslationSuite.scala b/spark/src/test/scala/org/apache/comet/serde/operator/IcebergWriteProtoTranslationSuite.scala new file mode 100644 index 0000000000..73b0ab721d --- /dev/null +++ b/spark/src/test/scala/org/apache/comet/serde/operator/IcebergWriteProtoTranslationSuite.scala @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.serde.operator + +import scala.jdk.CollectionConverters._ + +import org.scalatest.funsuite.AnyFunSuite + +import org.apache.comet.serde.OperatorOuterClass.{CompressionCodec, IcebergWriterMode} + +/** + * Unit tests for the protobuf translation. Iceberg must be on the classpath because the property + * keys / numeric defaults are looked up reflectively from `org.apache.iceberg.TableProperties`. + */ +class IcebergWriteProtoTranslationSuite extends AnyFunSuite { + + import IcebergWriteProtoTranslation._ + + private val TestCreatedBy = "Apache DataFusion Comet (test)" + + // -- compression ----------------------------------------------------------- + + test("compression defaults to zstd at parquet-mr's level 3 when no property is set") { + val settings = buildParquetSettings(Map.empty, TestCreatedBy) + assert(settings.getCompression == CompressionCodec.Zstd) + // See the dedicated zstd-default test below for the rationale. + assert(settings.getCompressionLevel == 3) + } + + test("compression maps each Iceberg codec string to the matching enum") { + val mapping = Seq( + "uncompressed" -> CompressionCodec.None, + "none" -> CompressionCodec.None, + "snappy" -> CompressionCodec.Snappy, + "gzip" -> CompressionCodec.Gzip, + "lz4" -> CompressionCodec.Lz4, + "zstd" -> CompressionCodec.Zstd, + "brotli" -> CompressionCodec.Brotli) + mapping.foreach { case (codec, expected) => + val settings = + buildParquetSettings(Map(Keys.ParquetCompression -> codec), TestCreatedBy) + assert(settings.getCompression == expected, s"codec=$codec") + } + } + + test("compression codec parsing is case-insensitive") { + val settings = + buildParquetSettings(Map(Keys.ParquetCompression -> "ZSTD"), TestCreatedBy) + assert(settings.getCompression == CompressionCodec.Zstd) + } + + test("unknown compression codec throws") { + val ex = intercept[IllegalArgumentException] { + buildParquetSettings(Map(Keys.ParquetCompression -> "lzo"), TestCreatedBy) + } + assert(ex.getMessage.contains("lzo")) + } + + test("compression level is emitted when present and parseable") { + val settings = buildParquetSettings(Map(Keys.ParquetCompressionLevel -> "9"), TestCreatedBy) + assert(settings.hasCompressionLevel) + assert(settings.getCompressionLevel == 9) + } + + test("non-numeric compression level is silently dropped for non-zstd codecs") { + val settings = + buildParquetSettings( + Map(Keys.ParquetCompression -> "snappy", Keys.ParquetCompressionLevel -> "fast"), + TestCreatedBy) + assert(!settings.hasCompressionLevel) + } + + test("zstd without explicit level gets parquet-mr's default of 3") { + // parquet-mr defaults zstd to level 3 but parquet-rs defaults to 1; we substitute 3 to keep + // file sizes consistent with iceberg-java. + val settings = buildParquetSettings(Map.empty, TestCreatedBy) + assert(settings.getCompression == CompressionCodec.Zstd) + assert(settings.hasCompressionLevel) + assert(settings.getCompressionLevel == 3) + } + + test("explicit zstd compression level wins over the parquet-mr default") { + val settings = buildParquetSettings( + Map(Keys.ParquetCompression -> "zstd", Keys.ParquetCompressionLevel -> "15"), + TestCreatedBy) + assert(settings.getCompressionLevel == 15) + } + + test("non-zstd codecs without explicit level emit no level") { + Seq("snappy", "gzip", "lz4", "brotli", "none").foreach { codec => + val settings = + buildParquetSettings(Map(Keys.ParquetCompression -> codec), TestCreatedBy) + assert(!settings.hasCompressionLevel, s"codec=$codec should leave compression level unset") + } + } + + // -- sizes / limits -------------------------------------------------------- + + test("row-group/page/dict sizes fall back to Iceberg defaults when unset") { + val settings = buildParquetSettings(Map.empty, TestCreatedBy) + assert(settings.getRowGroupSizeBytes == Defaults.RowGroupSizeBytes) + assert(settings.getPageSizeBytes == Defaults.PageSizeBytes) + assert(settings.getDictSizeBytes == Defaults.DictSizeBytes) + assert(settings.getPageRowLimit == Defaults.PageRowLimit) + } + + test("sizes are read from properties when set") { + val settings = buildParquetSettings( + Map( + Keys.ParquetRowGroupSizeBytes -> "67108864", + Keys.ParquetPageSizeBytes -> "65536", + Keys.ParquetDictSizeBytes -> "1048576", + Keys.ParquetPageRowLimit -> "1000"), + TestCreatedBy) + assert(settings.getRowGroupSizeBytes == 67108864L) + assert(settings.getPageSizeBytes == 65536L) + assert(settings.getDictSizeBytes == 1048576L) + assert(settings.getPageRowLimit == 1000) + } + + // -- metrics mode (table-level) ------------------------------------------- + + test("metrics default absent -> Iceberg's truncate(16) default applies") { + val settings = buildParquetSettings(Map.empty, TestCreatedBy) + assert(settings.getDefaultStatisticsEnabled) + assert(settings.hasStatisticsTruncateLength) + assert(settings.getStatisticsTruncateLength == 16) + } + + test("metrics default 'full' -> stats enabled, no truncation") { + val settings = buildParquetSettings(Map(Keys.MetricsModeDefault -> "full"), TestCreatedBy) + assert(settings.getDefaultStatisticsEnabled) + assert(!settings.hasStatisticsTruncateLength) + } + + test("metrics default 'truncate(N)' -> stats enabled, truncate length set") { + val settings = + buildParquetSettings(Map(Keys.MetricsModeDefault -> "truncate(32)"), TestCreatedBy) + assert(settings.getDefaultStatisticsEnabled) + assert(settings.hasStatisticsTruncateLength) + assert(settings.getStatisticsTruncateLength == 32) + } + + test("metrics default 'none' -> stats disabled, no truncation") { + val settings = buildParquetSettings(Map(Keys.MetricsModeDefault -> "none"), TestCreatedBy) + assert(!settings.getDefaultStatisticsEnabled) + assert(!settings.hasStatisticsTruncateLength) + } + + test("metrics default 'counts' throws (must be filtered by detection rule)") { + val ex = intercept[IllegalArgumentException] { + buildParquetSettings(Map(Keys.MetricsModeDefault -> "counts"), TestCreatedBy) + } + assert(ex.getMessage.contains("counts")) + } + + // -- per-column overrides ------------------------------------------------- + + test("per-column metrics override emits enabled+truncate length") { + val settings = buildParquetSettings( + Map( + s"${Keys.MetricsModeColumnPrefix}id" -> "full", + s"${Keys.MetricsModeColumnPrefix}name" -> "truncate(8)", + s"${Keys.MetricsModeColumnPrefix}payload" -> "none"), + TestCreatedBy) + val byColumn = + settings.getColumnStatisticsList.asScala.map(s => s.getColumn -> s).toMap + + assert(byColumn("id").getEnabled) + assert(!byColumn("id").hasTruncateLength) + + assert(byColumn("name").getEnabled) + assert(byColumn("name").getTruncateLength == 8) + + assert(!byColumn("payload").getEnabled) + assert(!byColumn("payload").hasTruncateLength) + } + + // -- writer mode resolution ----------------------------------------------- + + test("resolveWriterMode picks UNPARTITIONED for unpartitioned spec regardless of fanout") { + assert( + resolveWriterMode(specIsUnpartitioned = true, useFanoutWriter = false) == + IcebergWriterMode.ICEBERG_WRITER_UNPARTITIONED) + assert( + resolveWriterMode(specIsUnpartitioned = true, useFanoutWriter = true) == + IcebergWriterMode.ICEBERG_WRITER_UNPARTITIONED) + } + + test("resolveWriterMode picks FANOUT for partitioned + fanout") { + assert( + resolveWriterMode(specIsUnpartitioned = false, useFanoutWriter = true) == + IcebergWriterMode.ICEBERG_WRITER_FANOUT) + } + + test("resolveWriterMode picks CLUSTERED for partitioned + non-fanout") { + assert( + resolveWriterMode(specIsUnpartitioned = false, useFanoutWriter = false) == + IcebergWriterMode.ICEBERG_WRITER_CLUSTERED) + } + + // -- common message -------------------------------------------------------- + + test("buildCommon round-trips every field") { + val settings = buildParquetSettings(Map.empty, TestCreatedBy) + val common = buildCommon( + catalogProperties = Map("s3.access-key-id" -> "AKIA", "s3.region" -> "us-east-1"), + metadataLocation = "s3://bucket/warehouse/db/t/metadata/v3.metadata.json", + icebergSchemaJson = """{"schema-id":0,"fields":[]}""", + partitionSpecJson = """{"spec-id":0,"fields":[]}""", + sortOrderId = 4, + dataLocation = "s3://bucket/warehouse/db/t/data", + operationId = "abc-123", + targetFileSizeBytes = 512L * 1024 * 1024, + writerMode = IcebergWriterMode.ICEBERG_WRITER_CLUSTERED, + parquetSettings = settings, + catalogName = Some("prod_glue")) + + assert(common.getMetadataLocation == "s3://bucket/warehouse/db/t/metadata/v3.metadata.json") + assert(common.getIcebergSchemaJson == """{"schema-id":0,"fields":[]}""") + assert(common.getPartitionSpecJson == """{"spec-id":0,"fields":[]}""") + assert(common.getSortOrderId == 4) + assert(common.getDataLocation == "s3://bucket/warehouse/db/t/data") + assert(common.getOperationId == "abc-123") + assert(common.getTargetFileSizeBytes == 512L * 1024 * 1024) + assert(common.getWriterMode == IcebergWriterMode.ICEBERG_WRITER_CLUSTERED) + assert(common.getParquetSettings == settings) + assert(common.getCatalogName == "prod_glue") + val cp = common.getCatalogPropertiesMap + assert(cp.get("s3.access-key-id") == "AKIA") + assert(cp.get("s3.region") == "us-east-1") + } + + test("buildCommon omits the catalog map when no entries are provided") { + val common = buildCommon( + catalogProperties = Map.empty, + metadataLocation = "file:/tmp/t/metadata/v1.metadata.json", + icebergSchemaJson = "{}", + partitionSpecJson = "{}", + sortOrderId = 0, + dataLocation = "file:/tmp/t/data", + operationId = "op", + targetFileSizeBytes = 1024L, + writerMode = IcebergWriterMode.ICEBERG_WRITER_UNPARTITIONED, + parquetSettings = buildParquetSettings(Map.empty, TestCreatedBy), + catalogName = None) + assert(common.getCatalogPropertiesMap.isEmpty) + assert(common.getCatalogName.isEmpty) + } + + // -- created_by passthrough ------------------------------------------------ + + test("createdBy is written into the settings message verbatim") { + val settings = buildParquetSettings(Map.empty, "Some Custom Identifier") + assert(settings.getCreatedBy == "Some Custom Identifier") + } +}