From d4fc959a8e9d88159aad9e0270c7ea357b09a0c5 Mon Sep 17 00:00:00 2001 From: Corie Watson Date: Fri, 16 May 2025 01:30:16 +0100 Subject: [PATCH 1/2] fix(firestore-bigquery-change-tracker): keep partition value on delete using old data --- .../firestore-bigquery-change-tracker/package.json | 2 +- .../src/bigquery/partitioning.ts | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/firestore-bigquery-export/firestore-bigquery-change-tracker/package.json b/firestore-bigquery-export/firestore-bigquery-change-tracker/package.json index 03acd78ac..ca4be5e9d 100644 --- a/firestore-bigquery-export/firestore-bigquery-change-tracker/package.json +++ b/firestore-bigquery-export/firestore-bigquery-change-tracker/package.json @@ -5,7 +5,7 @@ "url": "github.com/firebase/extensions.git", "directory": "firestore-bigquery-export/firestore-bigquery-change-tracker" }, - "version": "1.1.41", + "version": "1.1.42", "description": "Core change-tracker library for Cloud Firestore Collection BigQuery Exports", "main": "./lib/index.js", "scripts": { diff --git a/firestore-bigquery-export/firestore-bigquery-change-tracker/src/bigquery/partitioning.ts b/firestore-bigquery-export/firestore-bigquery-change-tracker/src/bigquery/partitioning.ts index 1d3a468fe..93ce0b206 100644 --- a/firestore-bigquery-export/firestore-bigquery-change-tracker/src/bigquery/partitioning.ts +++ b/firestore-bigquery-export/firestore-bigquery-change-tracker/src/bigquery/partitioning.ts @@ -1,5 +1,5 @@ import { FirestoreBigQueryEventHistoryTrackerConfig } from "."; -import { FirestoreDocumentChangeEvent } from ".."; +import { ChangeType, FirestoreDocumentChangeEvent } from ".."; import * as firebase from "firebase-admin"; import * as logs from "../logs"; @@ -7,7 +7,6 @@ import * as bigquery from "@google-cloud/bigquery"; import * as functions from "firebase-functions"; import { getNewPartitionField } from "./schema"; import { BigQuery, TableMetadata } from "@google-cloud/bigquery"; - import 
{ PartitionFieldType } from "../types"; export class Partitioning { @@ -195,11 +194,16 @@ export class Partitioning { Delete changes events have no data, return early as cannot partition on empty data. **/ getPartitionValue(event: FirestoreDocumentChangeEvent) { - if (!event.data) return {}; + // When old data is disabled and the operation is delete + // the data and old data will be null + if (event.data == null && event.oldData == null) return {}; const firestoreFieldName = this.config.timePartitioningFirestoreField; const fieldName = this.config.timePartitioningField; - const fieldValue = event.data[firestoreFieldName]; + const fieldValue = + event.operation === ChangeType.DELETE + ? event.oldData[firestoreFieldName] + : event.data[firestoreFieldName]; if (!fieldName || !fieldValue) { return {}; From ab9a148edd91c1016f892a7eb9f368e43e613040 Mon Sep 17 00:00:00 2001 From: Jacob Cable <32874567+cabljac@users.noreply.github.com> Date: Mon, 19 May 2025 15:55:33 +0100 Subject: [PATCH 2/2] fix(firestore-bigquery-export): use latest change tracker (#2429) * fix(firestore-bigquery-export): improve DELETE operation handling and documentation * chore(firestore-bigquery-export): bump extension version --- firestore-bigquery-export/CHANGELOG.md | 8 +++- firestore-bigquery-export/extension.yaml | 2 +- .../functions/package-lock.json | 8 ++-- .../functions/package.json | 2 +- .../guides/EXAMPLE_QUERIES.md | 42 +++++++++++++++++++ 5 files changed, 55 insertions(+), 7 deletions(-) diff --git a/firestore-bigquery-export/CHANGELOG.md b/firestore-bigquery-export/CHANGELOG.md index 653f7b117..20216e594 100644 --- a/firestore-bigquery-export/CHANGELOG.md +++ b/firestore-bigquery-export/CHANGELOG.md @@ -1,3 +1,9 @@ +## Version 0.2.5 + +fix: keep partition value on delete using old data + +docs: improve "Remove stale data" query in guide + ## Version 0.2.4 feat: Add bigquery dataset locations and remove duplicates @@ -10,7 +16,7 @@ fix: pass full document resource name to bigquery 
fix: remove default value on DATABASE_REGION -## Versions 0.2.1 +## Version 0.2.1 fix: correct database region params and make mutable diff --git a/firestore-bigquery-export/extension.yaml b/firestore-bigquery-export/extension.yaml index 744895f44..7721a7be9 100644 --- a/firestore-bigquery-export/extension.yaml +++ b/firestore-bigquery-export/extension.yaml @@ -13,7 +13,7 @@ # limitations under the License. name: firestore-bigquery-export -version: 0.2.4 +version: 0.2.5 specVersion: v1beta displayName: Stream Firestore to BigQuery diff --git a/firestore-bigquery-export/functions/package-lock.json b/firestore-bigquery-export/functions/package-lock.json index a8e5376e9..01cdad9f9 100644 --- a/firestore-bigquery-export/functions/package-lock.json +++ b/firestore-bigquery-export/functions/package-lock.json @@ -7,7 +7,7 @@ "name": "firestore-bigquery-export", "license": "Apache-2.0", "dependencies": { - "@firebaseextensions/firestore-bigquery-change-tracker": "^1.1.41", + "@firebaseextensions/firestore-bigquery-change-tracker": "^1.1.42", "@google-cloud/bigquery": "^7.6.0", "@types/chai": "^4.1.6", "@types/express-serve-static-core": "4.17.30", @@ -588,9 +588,9 @@ } }, "node_modules/@firebaseextensions/firestore-bigquery-change-tracker": { - "version": "1.1.41", - "resolved": "https://registry.npmjs.org/@firebaseextensions/firestore-bigquery-change-tracker/-/firestore-bigquery-change-tracker-1.1.41.tgz", - "integrity": "sha512-obyAc5aeOGb1NH/dxGo2Ndu7DHkqpgYyLn7vjwMtiM3rLmCgBuX/O15FZ3K2ytbBGwrhMYg/uY7KwOC1UBxlSg==", + "version": "1.1.42", + "resolved": "https://registry.npmjs.org/@firebaseextensions/firestore-bigquery-change-tracker/-/firestore-bigquery-change-tracker-1.1.42.tgz", + "integrity": "sha512-IdGKcVoLGZOZNlGMGye3AndiobqEK9by/3TgOCy/AUs+YbcCQOFPopkt7Q0jHbj0VHeFHfGV0vg5ngI0tG2TMg==", "dependencies": { "@google-cloud/bigquery": "^7.6.0", "@google-cloud/resource-manager": "^5.1.0", diff --git a/firestore-bigquery-export/functions/package.json 
b/firestore-bigquery-export/functions/package.json index 1d5a50285..d9f1ec007 100644 --- a/firestore-bigquery-export/functions/package.json +++ b/firestore-bigquery-export/functions/package.json @@ -13,7 +13,7 @@ "author": "Jan Wyszynski ", "license": "Apache-2.0", "dependencies": { - "@firebaseextensions/firestore-bigquery-change-tracker": "^1.1.41", + "@firebaseextensions/firestore-bigquery-change-tracker": "^1.1.42", "@google-cloud/bigquery": "^7.6.0", "@types/chai": "^4.1.6", "@types/express-serve-static-core": "4.17.30", diff --git a/firestore-bigquery-export/guides/EXAMPLE_QUERIES.md b/firestore-bigquery-export/guides/EXAMPLE_QUERIES.md index 77c1fa1c7..3c2998955 100644 --- a/firestore-bigquery-export/guides/EXAMPLE_QUERIES.md +++ b/firestore-bigquery-export/guides/EXAMPLE_QUERIES.md @@ -115,6 +115,10 @@ If you want to clean up data from your `changelog` table, use the following `DELETE` query to delete all rows that fall within a certain time period, e.g. greater than 1 month old. +#### Option 1: Remove stale changelog records but keep latest change per document (default) + +If you want to remove entries that are over one month old while always keeping the latest change for each document (even when that latest change is itself over one month old), use the following query: + ```sql /* The query below deletes any rows below that are over one month old. */ DELETE FROM `[PROJECT ID].[DATASET ID].[CHANGELOG TABLE ID]` @@ -132,3 +136,41 @@ WHERE (document_name, timestamp) IN AND DATETIME(t.timestamp) < DATE_ADD(CURRENT_DATETIME(), INTERVAL -1 MONTH) ) ``` + +⚠️ Note: This query keeps the most recent record for every document, even when that last change (e.g., a DELETE) happened more than a month ago. All earlier entries older than one month are removed, so use this only if you do not need to retain full historical state in your changelog table. 
+ +#### Option 2: Remove all changelog records older than one month — including latest DELETE operations + +If you want to remove all entries that are over one month old, regardless of whether they are the latest change for a document (e.g., including DELETE operations), use the following query: + +```sql +/* Deletes all changelog records older than one month, including latest DELETEs */ +DELETE FROM `[PROJECT ID].[DATASET ID].[CHANGELOG TABLE ID]` +WHERE DATETIME(timestamp) < DATE_ADD(CURRENT_DATETIME(), INTERVAL -1 MONTH) +``` + +#### Option 3: Remove stale changelog records, keeping the latest change per document unless it is a DELETE + +This option removes old records that are not the latest change for a document, and it will also delete DELETE operations even if they are the latest change for a document — as long as they are older than one month. + +Use this if you want to aggressively clean up deleted documents from your changelog, even if that means latest views will no longer reflect that those documents were deleted. + +```sql +/* Deletes non-latest changelog records over one month old, + plus DELETEs that are the latest entry for a document */ +DELETE FROM `[PROJECT ID].[DATASET ID].[CHANGELOG TABLE ID]` +WHERE (document_name, timestamp) IN ( + WITH latest AS ( + SELECT MAX(timestamp) AS timestamp, document_name + FROM `[PROJECT ID].[DATASET ID].[CHANGELOG TABLE ID]` + GROUP BY document_name + ) + SELECT (t.document_name, t.timestamp) + FROM `[PROJECT ID].[DATASET ID].[CHANGELOG TABLE ID]` AS t + JOIN latest ON t.document_name = latest.document_name + WHERE (t.timestamp != latest.timestamp OR t.operation = 'DELETE') + AND DATETIME(t.timestamp) < DATE_ADD(CURRENT_DATETIME(), INTERVAL -1 MONTH) +) +``` + +⚠️ Note: This will remove DELETE records that are older than one month even if they are the most recent change. As a result, your \_latest view will no longer show that those documents were deleted — they may appear as if they never existed. 
Use this option only if that behavior is acceptable for your use case.