From 687727d3b44d9d7905f3368025dab1ef7b4af2b8 Mon Sep 17 00:00:00 2001
From: Shefeek Jinnah <shefeek@hotdata.dev>
Date: Wed, 22 Apr 2026 00:33:40 +0530
Subject: [PATCH 01/10] feat: add table-size based cache pass-through

---
 src/datafusion-local/src/lib.rs      | 17 ++++++++++-
 src/datafusion/src/optimizers/mod.rs | 44 ++++++++++++++++++++++++----
 2 files changed, 54 insertions(+), 7 deletions(-)
diff --git a/src/datafusion-local/src/lib.rs b/src/datafusion-local/src/lib.rs
index 678ae75e..6416c6b7 100644
--- a/src/datafusion-local/src/lib.rs
+++ b/src/datafusion-local/src/lib.rs
@@ -71,6 +71,9 @@ pub struct LiquidCacheLocalBuilder {
     span: fastrace::Span,
 
     eager_shredding: bool,
+
+    /// Maximum total file size in bytes for a scan to use LiquidCache; `None` means no limit.
+    max_scan_bytes: Option<u64>,
 }
 
 impl Default for LiquidCacheLocalBuilder {
@@ -84,6 +87,7 @@ impl Default for LiquidCacheLocalBuilder {
             hydration_policy: Box::new(AlwaysHydrate::new()),
             span: fastrace::Span::enter_with_local_parent("liquid_cache_datafusion_local_builder"),
             eager_shredding: true,
+            max_scan_bytes: None,
         }
     }
 }
@@ -142,6 +146,14 @@ impl LiquidCacheLocalBuilder {
         self
     }
 
+    /// Set maximum total file size (in bytes) for a scan to be routed
+    /// through LiquidCache. Scans exceeding this threshold are read
+    /// directly from the parquet source, bypassing the cache entirely.
+    pub fn with_max_scan_bytes(mut self, max_bytes: u64) -> Self {
+        self.max_scan_bytes = Some(max_bytes);
+        self
+    }
+
     /// Build a SessionContext with liquid cache configured
     /// Returns the SessionContext and the liquid cache reference
     pub async fn build(
@@ -174,7 +186,10 @@ impl LiquidCacheLocalBuilder {
 
         let date_extract_optimizer = Arc::new(LineageOptimizer::new());
 
-        let optimizer = LocalModeOptimizer::new(cache_ref.clone(), self.eager_shredding);
+        let mut optimizer = LocalModeOptimizer::new(cache_ref.clone(), self.eager_shredding);
+        if let Some(max_bytes) = self.max_scan_bytes {
+            optimizer = optimizer.with_max_scan_bytes(max_bytes);
+        }
 
         let state = datafusion::execution::SessionStateBuilder::new()
             .with_config(config)
diff --git a/src/datafusion/src/optimizers/mod.rs b/src/datafusion/src/optimizers/mod.rs
index cc1950f8..6efbf0d9 100644
--- a/src/datafusion/src/optimizers/mod.rs
+++ b/src/datafusion/src/optimizers/mod.rs
@@ -100,6 +100,10 @@ pub(crate) fn variant_mappings_from_field(field: &Field) -> Option<Vec<VariantFi
 pub struct LocalModeOptimizer {
     cache: LiquidCacheParquetRef,
     eager_shredding: bool,
+    /// When set, parquet scans whose total file size exceeds this threshold
+    /// are left as vanilla DataFusion reads instead of being wrapped by
+    /// LiquidCache. `None` means cache every scan (current default).
+    max_scan_bytes: Option<u64>,
 }
 
 impl LocalModeOptimizer {
@@ -108,6 +112,7 @@ impl LocalModeOptimizer {
         Self {
             cache,
             eager_shredding,
+            max_scan_bytes: None,
         }
     }
 
@@ -116,8 +121,17 @@ impl LocalModeOptimizer {
         Self {
             cache,
             eager_shredding: true,
+            max_scan_bytes: None,
         }
     }
+
+    /// Set maximum total file size (in bytes) for a parquet scan to be
+    /// routed through LiquidCache. Scans exceeding this threshold are read
+    /// directly from the underlying parquet source.
+    pub fn with_max_scan_bytes(mut self, max_bytes: u64) -> Self {
+        self.max_scan_bytes = Some(max_bytes);
+        self
+    }
 }
 
 impl PhysicalOptimizerRule for LocalModeOptimizer {
@@ -126,11 +140,15 @@ impl PhysicalOptimizerRule for LocalModeOptimizer {
         plan: Arc<dyn ExecutionPlan>,
         _config: &ConfigOptions,
     ) -> Result<Arc<dyn ExecutionPlan>, datafusion::error::DataFusionError> {
-        Ok(rewrite_data_source_plan(
-            plan,
-            &self.cache,
-            self.eager_shredding,
-        ))
+        let max_scan_bytes = self.max_scan_bytes;
+        let cache = &self.cache;
+        let eager = self.eager_shredding;
+        let rewritten = plan
+            .transform_up(|node| {
+                try_optimize_parquet_source(node, cache, eager, max_scan_bytes)
+            })
+            .unwrap();
+        Ok(rewritten.data)
     }
 
     fn name(&self) -> &str {
@@ -151,7 +169,7 @@ pub fn rewrite_data_source_plan(
     eager_shredding: bool,
 ) -> Arc<dyn ExecutionPlan> {
     let rewritten = plan
-        .transform_up(|node| try_optimize_parquet_source(node, cache, eager_shredding))
+        .transform_up(|node| try_optimize_parquet_source(node, cache, eager_shredding, None))
         .unwrap();
     rewritten.data
 }
@@ -160,12 +178,26 @@ fn try_optimize_parquet_source(
     plan: Arc<dyn ExecutionPlan>,
     cache: &LiquidCacheParquetRef,
     eager_shredding: bool,
+    max_scan_bytes: Option<u64>,
 ) -> Result<Transformed<Arc<dyn ExecutionPlan>>, datafusion::error::DataFusionError> {
     let any_plan = plan.as_any();
     if let Some(data_source_exec) = any_plan.downcast_ref::<DataSourceExec>()
         && let Some((file_scan_config, parquet_source)) =
             data_source_exec.downcast_to_file_source::<ParquetSource>()
     {
+        // Skip caching if the scan's total file size exceeds the threshold.
+        if let Some(max_bytes) = max_scan_bytes {
+            let total: u64 = file_scan_config
+                .file_groups
+                .iter()
+                .flat_map(|g| g.files())
+                .map(|f| f.object_meta.size)
+                .sum();
+            if total > max_bytes {
+                return Ok(Transformed::no(plan));
+            }
+        }
+
         let mut new_config = file_scan_config.clone();
 
         let mut new_source =

From 0fa8f70048890657a6ad6cf3a7673b1eda3b5581 Mon Sep 17 00:00:00 2001
From: Shefeek Jinnah <shefeek@hotdata.dev>
Date: Wed, 22 Apr 2026 00:50:46 +0530
Subject: [PATCH 02/10] Fixing formatting issue

---
 src/datafusion/src/optimizers/mod.rs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/datafusion/src/optimizers/mod.rs b/src/datafusion/src/optimizers/mod.rs
index 6efbf0d9..d82dc30d 100644
--- a/src/datafusion/src/optimizers/mod.rs
+++ b/src/datafusion/src/optimizers/mod.rs
@@ -144,9 +144,7 @@ impl PhysicalOptimizerRule for LocalModeOptimizer {
         let cache = &self.cache;
         let eager = self.eager_shredding;
         let rewritten = plan
-            .transform_up(|node| {
-                try_optimize_parquet_source(node, cache, eager, max_scan_bytes)
-            })
+            .transform_up(|node| try_optimize_parquet_source(node, cache, eager, max_scan_bytes))
             .unwrap();
         Ok(rewritten.data)
     }

From 800022585f76351232707780e702a72bcf1d74e0 Mon Sep 17 00:00:00 2001
From: Shefeek Jinnah <shefeek@hotdata.dev>
Date: Thu, 23 Apr 2026 20:15:51 +0530
Subject: [PATCH 03/10] Adding some logs and disabling CI tests

---
 .github/workflows/ci.yml             | 482 +--------------------------
 src/datafusion/src/optimizers/mod.rs |   5 +
 2 files changed, 8 insertions(+), 479 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f4b97efa..9737fbd5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -4,6 +4,7 @@ on:
     push:
         branches:
             - main
+            - dev
     pull_request:
         branches:
             - "*"
@@ -19,500 +20,23 @@ jobs:
         runs-on: ubuntu-latest
         steps:
             - uses: actions/checkout@v4
-            - uses: ./.github/actions/free-disk-space
             - uses: dtolnay/rust-toolchain@stable
               with:
                   components: clippy, rustfmt
             - uses: Swatinem/rust-cache@v2
-              with:
-                  # Share one cache across all jobs on the same OS.
-                  # Prevent PR runs from consuming cache quota by saving only on main.
-                  shared-key: ci-${{ runner.os }}
-                  save-if: ${{ github.ref == 'refs/heads/main' }}
-
-            - name: tailwindcss
-              run: |
-                  cd dev/dev-tools
-                  mkdir -p vendor
-                  npm install tailwindcss @tailwindcss/cli
-                  wget https://github.com/saadeghi/daisyui/releases/latest/download/daisyui.mjs -O vendor/daisyui.mjs
-                  wget https://github.com/saadeghi/daisyui/releases/latest/download/daisyui-theme.mjs -O vendor/daisyui-theme.mjs
-                  npx @tailwindcss/cli -i tailwind.css -o assets/tailwind.css
 
             - name: Check formatting
               run: cargo fmt --all -- --check
 
-            - name: Check documentation
-              run: cargo doc --no-deps --document-private-items
-              env:
-                  RUSTDOCFLAGS: -D warnings
-
             - name: Run clippy
               run: cargo clippy --all-targets --all-features -- -D warnings
 
-            - name: Install cargo-shear
-              run: cargo install cargo-shear --locked
-
-            - name: Check for unused dependencies
-              run: cargo shear
-
     unit_test:
         name: Unit Test
         runs-on: ubuntu-latest
-        steps:
-            - uses: actions/checkout@v4
-            - uses: ./.github/actions/free-disk-space
-            - uses: dtolnay/rust-toolchain@stable
-            - uses: Swatinem/rust-cache@v2
-              with:
-                  shared-key: ci-${{ runner.os }}
-                  save-if: ${{ github.ref == 'refs/heads/main' }}
-            - name: Install cargo-llvm-cov
-              uses: taiki-e/install-action@cargo-llvm-cov
-            - name: tailwindcss
-              run: |
-                  cd dev/dev-tools
-                  mkdir -p vendor
-                  npm install tailwindcss @tailwindcss/cli
-                  wget https://github.com/saadeghi/daisyui/releases/latest/download/daisyui.mjs -O vendor/daisyui.mjs
-                  wget https://github.com/saadeghi/daisyui/releases/latest/download/daisyui-theme.mjs -O vendor/daisyui-theme.mjs
-                  npx @tailwindcss/cli -i tailwind.css -o assets/tailwind.css
-            - name: Generate code coverage
-              run: cargo llvm-cov --workspace --codecov --output-path codecov.json
-            - name: Upload coverage to Codecov
-              uses: codecov/codecov-action@v5
-              with:
-                  token: ${{ secrets.CODECOV_TOKEN }}
-                  files: codecov.json
-                  fail_ci_if_error: true
-
-    shuttle_test:
-        name: Shuttle Test
-        runs-on: ubuntu-latest
-        steps:
-            - uses: actions/checkout@v4
-            - uses: ./.github/actions/free-disk-space
-            - uses: dtolnay/rust-toolchain@stable
-            - uses: Swatinem/rust-cache@v2
-              with:
-                  shared-key: ci-${{ runner.os }}
-                  save-if: ${{ github.ref == 'refs/heads/main' }}
-            - name: Run shuttle test
-              run: |
-                  cd src/core
-                  cargo test --features "shuttle" --release -- --test-threads=1 shuttle
-
-    address_san:
-        name: Address Sanitizer
-        runs-on: ubuntu-latest
-        steps:
-            - uses: actions/checkout@v4
-            - uses: ./.github/actions/free-disk-space
-            # Sanitizers can only run on nightly
-            - uses: dtolnay/rust-toolchain@nightly
-              with:
-                  toolchain: nightly-2025-08-01
-                  components: rust-src
-
-            # Address sanitizers can't be cached: https://github.com/Swatinem/rust-cache/issues/161
-            - run: sudo apt-get update && sudo apt-get install -y llvm-dev
-            - name: Run address sanitizer
-              run: >
-                  env RUSTFLAGS="-Z sanitizer=address" cargo test -Zbuild-std --target x86_64-unknown-linux-gnu --tests -p liquid-cache-datafusion
-
-    clickbench:
-        name: ClickBench
-        runs-on: ubuntu-latest
-        steps:
-            - uses: actions/checkout@v4
-            - uses: ./.github/actions/free-disk-space
-            - uses: dtolnay/rust-toolchain@stable
-            - run: sudo apt-get update && sudo apt-get install -y wget
-            - name: Install cargo-llvm-cov
-              uses: taiki-e/install-action@cargo-llvm-cov
-            - uses: Swatinem/rust-cache@v2
-              with:
-                  shared-key: ci-${{ runner.os }}
-                  save-if: ${{ github.ref == 'refs/heads/main' }}
-            - name: Download ClickBench partition 0
-              run: |
-                  mkdir -p benchmark/data
-                  wget https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_0.parquet -O benchmark/data/hits_0.parquet
-            - name: Update manifest for partitioned data
-              run: |
-                  # Update the manifest to point to the partitioned data directory
-                  sed 's|"benchmark/clickbench/data/hits.parquet"|"benchmark/data/hits_0.parquet"|' \
-                    benchmark/clickbench/manifest.json > benchmark/clickbench/benchmark_manifest.json
-
-            - name: Run ClickBench
-              run: |
-                  source <(cargo llvm-cov show-env --export-prefix)
-                  cargo llvm-cov clean --workspace
-                  cargo build --bin bench_server
-                  cargo build --bin clickbench_client
-                  env RUST_LOG=info nohup cargo run --bin bench_server -- --abort-on-panic --cache-mode liquid --max-cache-mb 256 &> server.log &
-                  sleep 2  # Wait for server to start up
-                  env RUST_LOG=info cargo run --bin clickbench_client -- --manifest benchmark/clickbench/benchmark_manifest.json
-                  echo "=== Server logs ==="
-                  cat server.log || echo "No server log found"
-                  curl http://localhost:53703/shutdown
-                  env RUST_LOG=info cargo run --bin in_process -- --manifest benchmark/clickbench/benchmark_manifest.json --bench-mode liquid --max-cache-mb 256
-                  cargo llvm-cov report --codecov --output-path codecov_clickbench.json
-            - name: Upload coverage to Codecov
-              uses: codecov/codecov-action@v5
-              with:
-                  token: ${{ secrets.CODECOV_TOKEN }}
-                  files: codecov_clickbench.json
-                  fail_ci_if_error: true
-
-    tpch:
-        name: TPC-H
-        runs-on: ubuntu-latest
-        steps:
-            - uses: actions/checkout@v4
-            - uses: ./.github/actions/free-disk-space
-            - uses: dtolnay/rust-toolchain@stable
-            - run: sudo apt-get update && sudo apt-get install -y wget
-            - name: Install cargo-llvm-cov
-              uses: taiki-e/install-action@cargo-llvm-cov
-            - uses: Swatinem/rust-cache@v2
-              with:
-                  shared-key: ci-${{ runner.os }}
-                  save-if: ${{ github.ref == 'refs/heads/main' }}
-            - name: Setup TPC-H data
-              run: |
-                  curl -LsSf https://astral.sh/uv/install.sh | sh
-                  cd benchmark/tpch
-                  uvx --from duckdb python tpch_gen.py --scale 0.1
-            - name: Run TPC-H
-              run: |
-                  source <(cargo llvm-cov show-env --export-prefix)
-                  cargo llvm-cov clean --workspace
-                  cargo build --bin bench_server
-                  cargo build --bin tpch_client
-                  env RUST_LOG=info nohup cargo run --bin bench_server -- --abort-on-panic --cache-mode liquid --max-cache-mb 256 &> server.log &
-                  sleep 2  # Wait for server to start up
-                  env RUST_LOG=info cargo run --bin tpch_client -- --manifest benchmark/tpch/manifest.json --answer-dir benchmark/tpch/answers/sf0.1
-                  echo "=== Server logs ==="
-                  cat server.log || echo "No server log found"
-                  curl http://localhost:53703/shutdown
-                  env RUST_LOG=info cargo run --bin in_process -- --manifest benchmark/tpch/manifest.json --bench-mode liquid --max-cache-mb 256
-                  cargo llvm-cov report --codecov --output-path codecov_tpch.json
-            - name: Upload coverage to Codecov
-              uses: codecov/codecov-action@v5
-              with:
-                  token: ${{ secrets.CODECOV_TOKEN }}
-                  files: codecov_tpch.json
-                  fail_ci_if_error: true
-
-    tpcds:
-        name: TPC-DS
-        runs-on: ubuntu-latest
-        steps:
-            - uses: actions/checkout@v4
-            - uses: ./.github/actions/free-disk-space
-            - uses: dtolnay/rust-toolchain@stable
-            - run: sudo apt-get update && sudo apt-get install -y wget
-            - name: Install cargo-llvm-cov
-              uses: taiki-e/install-action@cargo-llvm-cov
-            - uses: Swatinem/rust-cache@v2
-              with:
-                  shared-key: ci-${{ runner.os }}
-                  save-if: ${{ github.ref == 'refs/heads/main' }}
-            - name: Setup TPC-DS data
-              run: |
-                  curl -LsSf https://astral.sh/uv/install.sh | sh
-                  cd benchmark/tpcds
-                  uvx --from duckdb python tpcds_gen.py --scale 0.1 --answers-dir answers --data-dir data --queries-dir queries
-            - name: Run TPC-DS
-              run: |
-                  source <(cargo llvm-cov show-env --export-prefix)
-                  cargo llvm-cov clean --workspace
-                  cargo build --bin bench_server
-                  cargo build --bin tpcds_client
-                  env RUST_LOG=info nohup cargo run --bin bench_server -- --abort-on-panic --cache-mode liquid --max-cache-mb 256 &> server.log &
-                  sleep 2  # Wait for server to start up
-                  env RUST_LOG=info cargo run --bin tpcds_client -- --manifest benchmark/tpcds/manifest.json --answer-dir benchmark/tpcds/answers/sf0.1
-                  echo "=== Server logs ==="
-                  cat server.log || echo "No server log found"
-                  curl http://localhost:53703/shutdown
-                  env RUST_LOG=info cargo run --bin in_process -- --manifest benchmark/tpcds/manifest.json --bench-mode liquid --max-cache-mb 256
-                  cargo llvm-cov report --codecov --output-path codecov_tpcds.json
-            - name: Upload coverage to Codecov
-              uses: codecov/codecov-action@v5
-              with:
-                  token: ${{ secrets.CODECOV_TOKEN }}
-                  files: codecov_tpcds.json
-                  fail_ci_if_error: true
-
-    stackoverflow:
-        name: StackOverflow
-        runs-on: ubuntu-latest
-        steps:
-            - uses: actions/checkout@v4
-            - uses: ./.github/actions/free-disk-space
-            - uses: dtolnay/rust-toolchain@stable
-            - name: Install system dependencies
-              run: |
-                  sudo apt-get update
-                  sudo apt-get install -y wget
-            - name: Prepare dataset directories
-              run: |
-                  mkdir -p benchmark/stackoverflow/data/dba
-                  mkdir -p benchmark/stackoverflow/downloads
-            - name: Cache StackOverflow dataset
-              uses: actions/cache@v4
-              with:
-                  path: |
-                      benchmark/stackoverflow/data/dba
-                      benchmark/stackoverflow/downloads
-                  key: stackoverflow-${{ runner.os }}-dba-v1
-                  restore-keys: |
-                      stackoverflow-${{ runner.os }}-dba-
-            - name: Install cargo-llvm-cov
-              uses: taiki-e/install-action@cargo-llvm-cov
-            - uses: Swatinem/rust-cache@v2
-              with:
-                  shared-key: ci-${{ runner.os }}
-                  save-if: ${{ github.ref == 'refs/heads/main' }}
-            - name: Prepare StackOverflow data
-              env:
-                  UV_CACHE_DIR: ${{ runner.temp }}/uv-cache
-                  UV_PYTHON: python3
-              run: |
-                  if [ ! -f benchmark/stackoverflow/data/dba/Posts.parquet ]; then
-                    curl -LsSf https://astral.sh/uv/install.sh | sh
-                    uv run --with duckdb python benchmark/stackoverflow/setup_stackoverflow.py --mode dba
-                  else
-                    echo "StackOverflow dataset already prepared, skipping rebuild"
-                  fi
-            - name: Run StackOverflow
-              run: |
-                  source <(cargo llvm-cov show-env --export-prefix)
-                  cargo llvm-cov clean --workspace
-                  cargo build --bin in_process
-                  env RUST_LOG=info cargo run --bin in_process -- --manifest benchmark/stackoverflow/manifest.dba.json --bench-mode liquid --max-cache-mb 10
-                  cargo llvm-cov report --codecov --output-path codecov_stackoverflow.json
-            - name: Upload coverage to Codecov
-              uses: codecov/codecov-action@v5
-              with:
-                  token: ${{ secrets.CODECOV_TOKEN }}
-                  files: codecov_stackoverflow.json
-                  fail_ci_if_error: true
-
-    benchmark:
-        name: Performance Benchmark
-        runs-on: pittsburgh
-        permissions:
-            contents: write
-            pull-requests: write
-        steps:
-            - uses: actions/checkout@v4
-              with:
-                  fetch-depth: 0
-                  token: ${{ secrets.GITHUB_TOKEN }}
-            - uses: dtolnay/rust-toolchain@stable
-            - name: Setup ClickBench partitioned data download
-              run: |
-                  mkdir -p benchmark/clickbench/data
-                  for partition in 0 1 2 3; do
-                    echo "Downloading partition ${partition}..."
-                    wget "https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_${partition}.parquet" \
-                      -O "benchmark/clickbench/data/hits_${partition}.parquet"
-                  done
-
-            - name: Update manifest for partitioned data
-              run: |
-                  # Update the manifest to point to the partitioned data directory
-                  sed 's|"benchmark/clickbench/data/hits.parquet"|"benchmark/clickbench/data"|' \
-                    benchmark/clickbench/manifest.json > benchmark/clickbench/benchmark_manifest.json
-
-            - name: Build benchmark binary
-              run: cargo build --release --bin in_process
-
-            - name: Run LiquidCache benchmark (in-process)
-              run: |
-                  mkdir -p benchmark_results
-                  env RUST_LOG=info cargo run --release --bin in_process -- \
-                    --manifest benchmark/clickbench/benchmark_manifest.json \
-                    --output benchmark_results/liquid.json \
-                    --iteration 5 \
-                    --reset-cache \
-                    --bench-mode liquid \
-                    --max-cache-mb 64
-
-            - name: Run DataFusion benchmark (plain parquet)
-              run: |
-                  env RUST_LOG=info cargo run --release --bin in_process -- \
-                    --manifest benchmark/clickbench/benchmark_manifest.json \
-                    --output benchmark_results/parquet.json \
-                    --iteration 5 \
-                    --bench-mode parquet
-
-            - name: Run DataFusion benchmark (default config)
-              run: |
-                  env RUST_LOG=info cargo run --release --bin in_process -- \
-                    --manifest benchmark/clickbench/benchmark_manifest.json \
-                    --output benchmark_results/df_default.json \
-                    --iteration 5 \
-                    --bench-mode datafusion-default
-
-            - name: Annotate results with commit/timestamp
-              run: |
-                  jq --arg timestamp "$(date -Iminutes)" --arg commit "${{ github.sha }}" \
-                     '. + {"timestamp": $timestamp, "commit": $commit}' \
-                     benchmark_results/liquid.json > benchmark_results/liquid_final.json
-                  jq --arg timestamp "$(date -Iminutes)" --arg commit "${{ github.sha }}" \
-                     '. + {"timestamp": $timestamp, "commit": $commit}' \
-                     benchmark_results/parquet.json > benchmark_results/parquet_final.json
-                  jq --arg timestamp "$(date -Iminutes)" --arg commit "${{ github.sha }}" \
-                     '. + {"timestamp": $timestamp, "commit": $commit}' \
-                     benchmark_results/df_default.json > benchmark_results/df_default_final.json
-
-            - name: Compare LiquidCache vs DataFusion (same runner)
-              id: compare
-              run: |
-                  python3 .github/compare_benchmarks.py \
-                    benchmark_results/liquid_final.json \
-                    benchmark_results/df_default_final.json \
-                    --output comparison.md
-                  echo "COMPARISON_AVAILABLE=true" >> $GITHUB_OUTPUT
-
-            - name: Comment PR with benchmark results
-              if: steps.compare.outputs.COMPARISON_AVAILABLE == 'true' && github.event_name == 'pull_request'
-              uses: actions/github-script@v7
-              with:
-                  script: |
-                      const fs = require('fs');
-
-                      let comment = '';
-                      try {
-                        comment = fs.readFileSync('comparison.md', 'utf8');
-                      } catch (error) {
-                        comment = 'Error reading benchmark comparison results';
-                      }
-
-                      // Check if this is an external PR (from a fork)
-                      const isExternalPR = context.payload.pull_request.head.repo.full_name !== context.payload.pull_request.base.repo.full_name;
-
-                      if (isExternalPR) {
-                        console.log('Skipping comment for external PR due to permission restrictions');
-                        console.log('Benchmark results:');
-                        console.log(comment);
-                        return;
-                      }
-
-                      try {
-                        // Find existing benchmark comment
-                        const comments = await github.rest.issues.listComments({
-                          owner: context.repo.owner,
-                          repo: context.repo.repo,
-                          issue_number: context.issue.number,
-                        });
-
-                        const botComment = comments.data.find(comment =>
-                          comment.user.type === 'Bot' &&
-                          comment.body.includes('## 📊 Benchmark Comparison')
-                        );
-
-                        if (botComment) {
-                          // Update existing comment
-                          await github.rest.issues.updateComment({
-                            owner: context.repo.owner,
-                            repo: context.repo.repo,
-                            comment_id: botComment.id,
-                            body: comment
-                          });
-                        } else {
-                          // Create new comment
-                          await github.rest.issues.createComment({
-                            owner: context.repo.owner,
-                            repo: context.repo.repo,
-                            issue_number: context.issue.number,
-                            body: comment
-                          });
-                        }
-                      } catch (error) {
-                        console.log('Failed to post comment, likely due to permissions:', error.message);
-                        console.log('Benchmark results:');
-                        console.log(comment);
-                      }
-
-    examples:
-        name: Run client/server/inprocess examples
-        runs-on: ubuntu-latest
         steps:
             - uses: actions/checkout@v4
             - uses: dtolnay/rust-toolchain@stable
             - uses: Swatinem/rust-cache@v2
-              with:
-                  shared-key: ci-${{ runner.os }}
-                  save-if: ${{ github.ref == 'refs/heads/main' }}
-
-            - name: Build LiquidCache server
-              run: cargo build --bin example_server
-            - name: Build LiquidCache client
-              run: cargo build --bin example_client
-            - name: Build LiquidCache client projection pushdown
-              run: cargo build --bin example_projection_pushdown
-            - name: Build LiquidCache in process eviction
-              run: cargo build --bin example_inprocess_cache_eviction
-            - name: Build LiquidCache in process insertion
-              run: cargo build --bin example_inprocess_insertion
-            - name: Build LiquidCache in process read
-              run: cargo build --bin example_inprocess_read
-
-            - name: Start LiquidCache server
-              run: |
-                  env RUST_LOG=info nohup cargo run --bin example_server -- --abort-on-panic &> server.log &
-                  echo $! > server.pid  # Save PID for later cleanup
-                  sleep 2  # Wait for server to start up
-
-            - name: Start LiquidCache client
-              run: |
-                  # First run to populate the cache
-                  env RUST_LOG=info cargo run --bin example_client
-                  # Run twice to test the cache
-                  env RUST_LOG=info cargo run --bin example_client
-
-            - name: Start LiquidCache client projection pushdown
-              run: |
-                  # First run to populate the cache
-                  env RUST_LOG=info cargo run --bin example_projection_pushdown
-                  # Run twice to test the cache
-                  env RUST_LOG=info cargo run --bin example_projection_pushdown
-
-            - name: Kill LiquidCache server and show logs
-              if: always()
-              run: |
-                  echo "=== Server logs ==="
-                  cat server.log || echo "No server log found"
-                  pkill -F server.pid || true
-                  rm -f server.pid
-
-            - name: Start LiquidCache in process projection pushdown
-              run: |
-                  # Run to populate to evict cache
-                  env RUST_LOG=info cargo run --bin example_inprocess_cache_eviction
-
-            - name: Start LiquidCache in process insert
-              run: |
-                  # Run to populate cache with arrow array
-                  env RUST_LOG=info cargo run --bin example_inprocess_insertion
-
-            - name: Start LiquidCache in process read arrow array
-              run: |
-                  # Run to populate cache and read arrow array
-                  env RUST_LOG=info cargo run --bin example_inprocess_read
-
-    kani:
-        name: Run Kani proofs
-        runs-on: ubuntu-22.04
-        steps:
-            - name: Checkout repository
-              uses: actions/checkout@v4
-            - uses: ./.github/actions/free-disk-space
-            - name: Verify storage crate with Kani
-              uses: model-checking/kani-github-action@v1.1
-              with:
-                  working-directory: src/core
+            - name: Run tests
+              run: cargo test --workspace
diff --git a/src/datafusion/src/optimizers/mod.rs b/src/datafusion/src/optimizers/mod.rs
index d82dc30d..19b6416c 100644
--- a/src/datafusion/src/optimizers/mod.rs
+++ b/src/datafusion/src/optimizers/mod.rs
@@ -192,6 +192,11 @@ fn try_optimize_parquet_source(
                 .map(|f| f.object_meta.size)
                 .sum();
             if total > max_bytes {
+                log::info!(
+                    "Skipping LiquidCache for scan with total size {} bytes (threshold: {} bytes)",
+                    total,
+                    max_bytes
+                );
                 return Ok(Transformed::no(plan));
             }
         }

From a6486fc435acea241f4dbc5b3140bc737ebb0a96 Mon Sep 17 00:00:00 2001
From: Shefeek Jinnah <shefeek@hotdata.dev>
Date: Fri, 24 Apr 2026 14:50:31 +0530
Subject: [PATCH 04/10] Cleaning up CI

---
 .github/workflows/ci.yml | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9737fbd5..54c6fb35 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -11,12 +11,10 @@ on:
 
 env:
     CARGO_TERM_COLOR: always
-    RUST_BACKTRACE: 1
-    RUSTFLAGS: "-C debuginfo=line-tables-only -C incremental=false"
 
 jobs:
     check:
-        name: Basic check
+        name: Format, clippy and compile check
         runs-on: ubuntu-latest
         steps:
             - uses: actions/checkout@v4
@@ -30,13 +28,3 @@ jobs:
 
             - name: Run clippy
               run: cargo clippy --all-targets --all-features -- -D warnings
-
-    unit_test:
-        name: Unit Test
-        runs-on: ubuntu-latest
-        steps:
-            - uses: actions/checkout@v4
-            - uses: dtolnay/rust-toolchain@stable
-            - uses: Swatinem/rust-cache@v2
-            - name: Run tests
-              run: cargo test --workspace

From 7d9eb112d6d8b663556bdf118c46148c506c6b39 Mon Sep 17 00:00:00 2001
From: Shefeek Jinnah <shefeek@hotdata.dev>
Date: Mon, 27 Apr 2026 11:08:54 +0530
Subject: [PATCH 05/10] ci: exclude dev-tools from the workspace clippy run

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 54c6fb35..c90a62bd 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -27,4 +27,4 @@ jobs:
               run: cargo fmt --all -- --check
 
             - name: Run clippy
-              run: cargo clippy --all-targets --all-features -- -D warnings
+              run: cargo clippy --workspace --exclude dev-tools --all-targets --all-features -- -D warnings

From c7458f9b37f5d219cf8855f93f7feac0d77ef532 Mon Sep 17 00:00:00 2001
From: Shefeek Jinnah <shefeek@hotdata.dev>
Date: Mon, 27 Apr 2026 11:33:43 +0530
Subject: [PATCH 06/10] Adding back basic test step

---
 .github/workflows/ci.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c90a62bd..3a102af2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -28,3 +28,6 @@ jobs:
 
             - name: Run clippy
               run: cargo clippy --workspace --exclude dev-tools --all-targets --all-features -- -D warnings
+
+            - name: Run tests
+              run: cargo test --workspace --exclude dev-tools

From 2abd813b856cdcdcdb24b0f46bf44bcf3e8956fa Mon Sep 17 00:00:00 2001
From: Shefeek Jinnah <shefeek@hotdata.dev>
Date: Tue, 28 Apr 2026 10:54:05 +0530
Subject: [PATCH 07/10] ci: pin Rust toolchain to 1.84.0 to match upstream

---
 .github/workflows/ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3a102af2..fa10b005 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,6 +20,7 @@ jobs:
             - uses: actions/checkout@v4
             - uses: dtolnay/rust-toolchain@stable
               with:
+                  toolchain: "1.84.0"
                   components: clippy, rustfmt
             - uses: Swatinem/rust-cache@v2
 

From 733daa619a26cd9b903f197ac6629086ff585225 Mon Sep 17 00:00:00 2001
From: Shefeek Jinnah <shefeek@hotdata.dev>
Date: Tue, 28 Apr 2026 13:18:04 +0530
Subject: [PATCH 08/10] ci: bump pinned Rust toolchain from 1.84.0 to 1.85.0

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index fa10b005..381dc98a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,7 +20,7 @@ jobs:
             - uses: actions/checkout@v4
             - uses: dtolnay/rust-toolchain@stable
               with:
-                  toolchain: "1.84.0"
+                  toolchain: "1.85.0"
                   components: clippy, rustfmt
             - uses: Swatinem/rust-cache@v2
 

From 9ab78d1a520633b069a2df22342249a5eb1c8975 Mon Sep 17 00:00:00 2001
From: Shefeek Jinnah <shefeek@hotdata.dev>
Date: Tue, 28 Apr 2026 13:20:11 +0530
Subject: [PATCH 09/10] ci: bump pinned Rust toolchain from 1.85.0 to 1.88.0

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 381dc98a..64c56906 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,7 +20,7 @@ jobs:
             - uses: actions/checkout@v4
             - uses: dtolnay/rust-toolchain@stable
               with:
-                  toolchain: "1.85.0"
+                  toolchain: "1.88.0"
                   components: clippy, rustfmt
             - uses: Swatinem/rust-cache@v2
 

From 1e0c5e99c832deb22955c6984a43da19b82f2fa4 Mon Sep 17 00:00:00 2001
From: Shefeek Jinnah <shefeek@hotdata.dev>
Date: Tue, 28 Apr 2026 13:23:45 +0530
Subject: [PATCH 10/10] ci: allow clippy::uninlined-format-args and clippy::useless-conversion

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 64c56906..43454e3f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -28,7 +28,7 @@ jobs:
               run: cargo fmt --all -- --check
 
             - name: Run clippy
-              run: cargo clippy --workspace --exclude dev-tools --all-targets --all-features -- -D warnings
+              run: cargo clippy --workspace --exclude dev-tools --all-targets --all-features -- -D warnings -A clippy::uninlined-format-args -A clippy::useless-conversion
 
             - name: Run tests
               run: cargo test --workspace --exclude dev-tools