diff --git a/firestore-bigquery-export/CHANGELOG.md b/firestore-bigquery-export/CHANGELOG.md index 653f7b117..20216e594 100644 --- a/firestore-bigquery-export/CHANGELOG.md +++ b/firestore-bigquery-export/CHANGELOG.md @@ -1,3 +1,9 @@ +## Version 0.2.5 + +fix: keep partition value on delete using old data + +docs: improve "Remove stale data" query in guide + ## Version 0.2.4 feat: Add bigquery dataset locations and remove duplicates @@ -10,7 +16,7 @@ fix: pass full document resource name to bigquery fix: remove default value on DATABASE_REGION -## Versions 0.2.1 +## Version 0.2.1 fix: correct database region params and make mutable diff --git a/firestore-bigquery-export/extension.yaml b/firestore-bigquery-export/extension.yaml index 744895f44..7721a7be9 100644 --- a/firestore-bigquery-export/extension.yaml +++ b/firestore-bigquery-export/extension.yaml @@ -13,7 +13,7 @@ # limitations under the License. name: firestore-bigquery-export -version: 0.2.4 +version: 0.2.5 specVersion: v1beta displayName: Stream Firestore to BigQuery diff --git a/firestore-bigquery-export/firestore-bigquery-change-tracker/package.json b/firestore-bigquery-export/firestore-bigquery-change-tracker/package.json index 03acd78ac..ca4be5e9d 100644 --- a/firestore-bigquery-export/firestore-bigquery-change-tracker/package.json +++ b/firestore-bigquery-export/firestore-bigquery-change-tracker/package.json @@ -5,7 +5,7 @@ "url": "github.com/firebase/extensions.git", "directory": "firestore-bigquery-export/firestore-bigquery-change-tracker" }, - "version": "1.1.41", + "version": "1.1.42", "description": "Core change-tracker library for Cloud Firestore Collection BigQuery Exports", "main": "./lib/index.js", "scripts": { diff --git a/firestore-bigquery-export/firestore-bigquery-change-tracker/src/bigquery/partitioning.ts b/firestore-bigquery-export/firestore-bigquery-change-tracker/src/bigquery/partitioning.ts index 1d3a468fe..93ce0b206 100644 --- 
a/firestore-bigquery-export/firestore-bigquery-change-tracker/src/bigquery/partitioning.ts +++ b/firestore-bigquery-export/firestore-bigquery-change-tracker/src/bigquery/partitioning.ts @@ -1,5 +1,5 @@ import { FirestoreBigQueryEventHistoryTrackerConfig } from "."; -import { FirestoreDocumentChangeEvent } from ".."; +import { ChangeType, FirestoreDocumentChangeEvent } from ".."; import * as firebase from "firebase-admin"; import * as logs from "../logs"; @@ -7,7 +7,6 @@ import * as bigquery from "@google-cloud/bigquery"; import * as functions from "firebase-functions"; import { getNewPartitionField } from "./schema"; import { BigQuery, TableMetadata } from "@google-cloud/bigquery"; - import { PartitionFieldType } from "../types"; export class Partitioning { @@ -195,11 +194,16 @@ export class Partitioning { Delete changes events have no data, return early as cannot partition on empty data. **/ getPartitionValue(event: FirestoreDocumentChangeEvent) { - if (!event.data) return {}; + // When old data is disabled and the operation is delete + // the data and old data will be null + if (event.data == null && event.oldData == null) return {}; const firestoreFieldName = this.config.timePartitioningFirestoreField; const fieldName = this.config.timePartitioningField; - const fieldValue = event.data[firestoreFieldName]; + const fieldValue = + event.operation === ChangeType.DELETE + ? 
event.oldData[firestoreFieldName] + : event.data[firestoreFieldName]; if (!fieldName || !fieldValue) { return {}; diff --git a/firestore-bigquery-export/functions/package-lock.json b/firestore-bigquery-export/functions/package-lock.json index a8e5376e9..01cdad9f9 100644 --- a/firestore-bigquery-export/functions/package-lock.json +++ b/firestore-bigquery-export/functions/package-lock.json @@ -7,7 +7,7 @@ "name": "firestore-bigquery-export", "license": "Apache-2.0", "dependencies": { - "@firebaseextensions/firestore-bigquery-change-tracker": "^1.1.41", + "@firebaseextensions/firestore-bigquery-change-tracker": "^1.1.42", "@google-cloud/bigquery": "^7.6.0", "@types/chai": "^4.1.6", "@types/express-serve-static-core": "4.17.30", @@ -588,9 +588,9 @@ } }, "node_modules/@firebaseextensions/firestore-bigquery-change-tracker": { - "version": "1.1.41", - "resolved": "https://registry.npmjs.org/@firebaseextensions/firestore-bigquery-change-tracker/-/firestore-bigquery-change-tracker-1.1.41.tgz", - "integrity": "sha512-obyAc5aeOGb1NH/dxGo2Ndu7DHkqpgYyLn7vjwMtiM3rLmCgBuX/O15FZ3K2ytbBGwrhMYg/uY7KwOC1UBxlSg==", + "version": "1.1.42", + "resolved": "https://registry.npmjs.org/@firebaseextensions/firestore-bigquery-change-tracker/-/firestore-bigquery-change-tracker-1.1.42.tgz", + "integrity": "sha512-IdGKcVoLGZOZNlGMGye3AndiobqEK9by/3TgOCy/AUs+YbcCQOFPopkt7Q0jHbj0VHeFHfGV0vg5ngI0tG2TMg==", "dependencies": { "@google-cloud/bigquery": "^7.6.0", "@google-cloud/resource-manager": "^5.1.0", diff --git a/firestore-bigquery-export/functions/package.json b/firestore-bigquery-export/functions/package.json index 1d5a50285..d9f1ec007 100644 --- a/firestore-bigquery-export/functions/package.json +++ b/firestore-bigquery-export/functions/package.json @@ -13,7 +13,7 @@ "author": "Jan Wyszynski ", "license": "Apache-2.0", "dependencies": { - "@firebaseextensions/firestore-bigquery-change-tracker": "^1.1.41", + "@firebaseextensions/firestore-bigquery-change-tracker": "^1.1.42", 
"@google-cloud/bigquery": "^7.6.0", "@types/chai": "^4.1.6", "@types/express-serve-static-core": "4.17.30", diff --git a/firestore-bigquery-export/guides/EXAMPLE_QUERIES.md b/firestore-bigquery-export/guides/EXAMPLE_QUERIES.md index 77c1fa1c7..3c2998955 100644 --- a/firestore-bigquery-export/guides/EXAMPLE_QUERIES.md +++ b/firestore-bigquery-export/guides/EXAMPLE_QUERIES.md @@ -115,6 +115,10 @@ If you want to clean up data from your `changelog` table, use the following `DELETE` query to delete all rows that fall within a certain time period, e.g. greater than 1 month old. +#### Option 1: Remove stale changelog records but keep latest change per document (default) + +If you want to remove entries that are over one month old while always keeping the latest change for each document (even if that change, e.g. a DELETE operation, is itself over one month old), use the following query: + ```sql /* The query below deletes any rows below that are over one month old. */ DELETE FROM `[PROJECT ID].[DATASET ID].[CHANGELOG TABLE ID]` @@ -132,3 +136,41 @@ WHERE (document_name, timestamp) IN AND DATETIME(t.timestamp) < DATE_ADD(CURRENT_DATETIME(), INTERVAL -1 MONTH) ) ``` + +⚠️ Note: This query will remove all entries older than one month except the most recent record for each document, which is kept even when that last change (e.g., a DELETE) happened more than a month ago. Use this only if you do not need to retain full historical state in your changelog table. 
+ +#### Option 2: Remove all changelog records older than one month — including the latest change per document + +If you want to remove all entries that are over one month old, regardless of whether they are the latest change for a document (e.g., including DELETE operations), use the following query: + +```sql +/* Deletes all changelog records older than one month, including latest DELETEs */ +DELETE FROM `[PROJECT ID].[DATASET ID].[CHANGELOG TABLE ID]` +WHERE DATETIME(timestamp) < DATE_ADD(CURRENT_DATETIME(), INTERVAL -1 MONTH) +``` + +#### Option 3: Remove stale changelog records older than one month, keeping the latest change per document unless it is a DELETE + +This option removes old records that are not the latest change for a document, and it will also delete DELETE operations even if they are the latest change for a document — as long as they are older than one month. + +Use this if you want to aggressively clean up deleted documents from your changelog, even if that means latest views will no longer reflect that those documents were deleted. + +```sql +/* Deletes non-latest changelog records over one month old, + plus DELETEs over one month old that are the latest entry for a document */ +DELETE FROM `[PROJECT ID].[DATASET ID].[CHANGELOG TABLE ID]` +WHERE (document_name, timestamp) IN ( + WITH latest AS ( + SELECT MAX(timestamp) AS timestamp, document_name + FROM `[PROJECT ID].[DATASET ID].[CHANGELOG TABLE ID]` + GROUP BY document_name + ) + SELECT (t.document_name, t.timestamp) + FROM `[PROJECT ID].[DATASET ID].[CHANGELOG TABLE ID]` AS t + JOIN latest ON t.document_name = latest.document_name + WHERE (t.timestamp != latest.timestamp OR t.operation = 'DELETE') + AND DATETIME(t.timestamp) < DATE_ADD(CURRENT_DATETIME(), INTERVAL -1 MONTH) +) +``` + +⚠️ Note: This will remove DELETE records that are older than one month even if they are the most recent change. As a result, your \_latest view will no longer show that those documents were deleted — they may appear as if they never existed. 
Use this option only if that behavior is acceptable for your use case.