From 60f1ff12d868921f8f405d41fbd28a74875d1380 Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Thu, 9 Apr 2026 11:26:53 +0100 Subject: [PATCH 01/25] Add ignore param to createDocuments for silent duplicate handling Adapters: - MariaDB/MySQL: INSERT IGNORE INTO - PostgreSQL: ON CONFLICT DO NOTHING - SQLite: INSERT OR IGNORE INTO - MongoDB: session-scoped pre-filter before insertMany Database.php: - Intra-batch dedup by ID (tenant-aware, first occurrence wins) - Pre-fetch existing IDs to skip known duplicates - Deferred relationship creation for ignore mode (no orphans) - Race-condition reconciliation via _createdAt timestamp verification upsertDocuments: - Batch-fetch existing docs with find() instead of per-row getDocument() - Tenant-aware composite keys for seenIds duplicate check All paths are tenant-per-document aware with null-safe array_filter. --- src/Database/Adapter.php | 3 +- src/Database/Adapter/Mongo.php | 93 ++++++++++++- src/Database/Adapter/Pool.php | 2 +- src/Database/Adapter/Postgres.php | 29 +++++ src/Database/Adapter/SQL.php | 210 +++++++++++++++++++++++++++++- src/Database/Adapter/SQLite.php | 5 + src/Database/Database.php | 194 ++++++++++++++++++++++++--- src/Database/Mirror.php | 3 + 8 files changed, 516 insertions(+), 23 deletions(-) diff --git a/src/Database/Adapter.php b/src/Database/Adapter.php index a7b385cce..9cc83d141 100644 --- a/src/Database/Adapter.php +++ b/src/Database/Adapter.php @@ -729,12 +729,13 @@ abstract public function createDocument(Document $collection, Document $document * * @param Document $collection * @param array $documents + * @param bool $ignore If true, silently ignore duplicate documents instead of throwing * * @return array * * @throws DatabaseException */ - abstract public function createDocuments(Document $collection, array $documents): array; + abstract public function createDocuments(Document $collection, array $documents, bool $ignore = false): array; /** * Update Document diff --git a/src/Database/Adapter/Mongo.php b/src/Database/Adapter/Mongo.php index 7ddde43d3..71bbc3454 100644 --- a/src/Database/Adapter/Mongo.php +++ b/src/Database/Adapter/Mongo.php @@ -1460,11 +1460,11 @@ public function castingBefore(Document $collection, Document $document): Documen * @throws DuplicateException * @throws DatabaseException */ - public function createDocuments(Document $collection, array $documents): array + public function createDocuments(Document $collection, array $documents, bool $ignore = false): array { $name = $this->getNamespace() . '_' . $this->filter($collection->getId()); - $options = $this->getTransactionOptions(); + $records = []; $hasSequence = null; $documents = \array_map(fn ($doc) => clone $doc, $documents); @@ -1487,6 +1487,95 @@ public function createDocuments(Document $collection, array $documents): array $records[] = $record; } + // Pre-filter duplicates within the session to avoid aborting the transaction. + if ($ignore && !empty($records)) { + $existingKeys = []; + + try { + if ($this->sharedTables && $this->tenantPerDocument) { + $idsByTenant = []; + foreach ($records as $record) { + $uid = $record['_uid'] ?? ''; + if ($uid === '') { + continue; + } + $tenant = $record['_tenant'] ?? $this->getTenant(); + $idsByTenant[$tenant][] = $uid; + } + + foreach ($idsByTenant as $tenant => $tenantUids) { + $tenantUids = \array_values(\array_unique($tenantUids)); + $findOptions = $this->getTransactionOptions([ + 'projection' => ['_uid' => 1], + 'batchSize' => \count($tenantUids), + ]); + $filters = ['_uid' => ['$in' => $tenantUids], '_tenant' => $tenant]; + $response = $this->client->find($name, $filters, $findOptions); + foreach ($response->cursor->firstBatch ?? [] as $doc) { + $existingKeys[$tenant . ':' . $doc->_uid] = true; + } + $cursorId = $response->cursor->id ?? 0; + while ($cursorId != 0) { + $more = $this->client->getMore($cursorId, $name, \count($tenantUids)); + foreach ($more->cursor->nextBatch ?? [] as $doc) { + $existingKeys[$tenant . ':' . $doc->_uid] = true; + } + $cursorId = $more->cursor->id ?? 0; + } + } + } else { + $uids = \array_filter(\array_map(fn ($r) => $r['_uid'] ?? null, $records), fn ($v) => $v !== null); + if (!empty($uids)) { + $uidValues = \array_values(\array_unique($uids)); + $findOptions = $this->getTransactionOptions([ + 'projection' => ['_uid' => 1], + 'batchSize' => \count($uidValues), + ]); + $filters = ['_uid' => ['$in' => $uidValues]]; + if ($this->sharedTables) { + $filters['_tenant'] = $this->getTenantFilters($collection->getId()); + } + $response = $this->client->find($name, $filters, $findOptions); + foreach ($response->cursor->firstBatch ?? [] as $doc) { + $existingKeys[$doc->_uid] = true; + } + $cursorId = $response->cursor->id ?? 0; + while ($cursorId != 0) { + $more = $this->client->getMore($cursorId, $name, \count($uidValues)); + foreach ($more->cursor->nextBatch ?? [] as $doc) { + $existingKeys[$doc->_uid] = true; + } + $cursorId = $more->cursor->id ?? 0; + } + } + } + } catch (MongoException $e) { + throw $this->processException($e); + } + + if (!empty($existingKeys)) { + $filteredRecords = []; + $filteredDocuments = []; + $tenantPerDoc = $this->sharedTables && $this->tenantPerDocument; + foreach ($records as $i => $record) { + $uid = $record['_uid'] ?? ''; + $key = $tenantPerDoc + ? ($record['_tenant'] ?? $this->getTenant()) . ':' . $uid + : $uid; + if (!isset($existingKeys[$key])) { + $filteredRecords[] = $record; + $filteredDocuments[] = $documents[$i]; + } + } + $records = $filteredRecords; + $documents = $filteredDocuments; + } + + if (empty($records)) { + return []; + } + } + try { $documents = $this->client->insertMany($name, $records, $options); } catch (MongoException $e) { diff --git a/src/Database/Adapter/Pool.php b/src/Database/Adapter/Pool.php index 668753387..ec508c3e4 100644 --- a/src/Database/Adapter/Pool.php +++ b/src/Database/Adapter/Pool.php @@ -268,7 +268,7 @@ public function createDocument(Document $collection, Document $document): Docume return $this->delegate(__FUNCTION__, \func_get_args()); } - public function createDocuments(Document $collection, array $documents): array + public function createDocuments(Document $collection, array $documents, bool $ignore = false): array { return $this->delegate(__FUNCTION__, \func_get_args()); } diff --git a/src/Database/Adapter/Postgres.php b/src/Database/Adapter/Postgres.php index 8dcf72025..814ecc8bc 100644 --- a/src/Database/Adapter/Postgres.php +++ b/src/Database/Adapter/Postgres.php @@ -1365,6 +1365,35 @@ public function updateDocument(Document $collection, string $id, Document $docum return $document; } + protected function getInsertKeyword(bool $ignore): string + { + return 'INSERT INTO'; + } + + protected function getInsertSuffix(bool $ignore, string $table): string + { + if (!$ignore) { + return ''; + } + + $conflictTarget = $this->sharedTables ? '("_uid", "_tenant")' : '("_uid")'; + + return "ON CONFLICT {$conflictTarget} DO NOTHING"; + } + + protected function getInsertPermissionsSuffix(bool $ignore): string + { + if (!$ignore) { + return ''; + } + + $conflictTarget = $this->sharedTables + ? '("_type", "_permission", "_document", "_tenant")' + : '("_type", "_permission", "_document")'; + + return "ON CONFLICT {$conflictTarget} DO NOTHING"; + } + /** * @param string $tableName * @param string $columns diff --git a/src/Database/Adapter/SQL.php b/src/Database/Adapter/SQL.php index 6864e6aee..6c918ef15 100644 --- a/src/Database/Adapter/SQL.php +++ b/src/Database/Adapter/SQL.php @@ -2471,11 +2471,87 @@ protected function execute(mixed $stmt): bool * @throws DuplicateException * @throws \Throwable */ - public function createDocuments(Document $collection, array $documents): array + public function createDocuments(Document $collection, array $documents, bool $ignore = false): array { if (empty($documents)) { return $documents; } + + // Pre-filter existing UIDs to prevent race-condition duplicates. + if ($ignore) { + $collectionId = $collection->getId(); + $name = $this->filter($collectionId); + $uids = \array_filter(\array_map(fn (Document $doc) => $doc->getId(), $documents), fn ($v) => $v !== null); + + if (!empty($uids)) { + try { + $placeholders = []; + $binds = []; + foreach (\array_values(\array_unique($uids)) as $i => $uid) { + $key = ':_dup_uid_' . $i; + $placeholders[] = $key; + $binds[$key] = $uid; + } + + $tenantFilter = ''; + if ($this->sharedTables) { + if ($this->tenantPerDocument) { + $tenants = \array_values(\array_unique(\array_filter( + \array_map(fn (Document $doc) => $doc->getTenant(), $documents), + fn ($v) => $v !== null + ))); + $tenantPlaceholders = []; + foreach ($tenants as $j => $tenant) { + $tKey = ':_dup_tenant_' . $j; + $tenantPlaceholders[] = $tKey; + $binds[$tKey] = $tenant; + } + $tenantFilter = ' AND _tenant IN (' . \implode(', ', $tenantPlaceholders) . ')'; + } else { + $tenantFilter = ' AND _tenant = :_dup_tenant'; + $binds[':_dup_tenant'] = $this->getTenant(); + } + } + + $tenantSelect = $this->sharedTables && $this->tenantPerDocument ? ', _tenant' : ''; + $sql = 'SELECT _uid' . $tenantSelect . ' FROM ' . $this->getSQLTable($name) + . ' WHERE _uid IN (' . \implode(', ', $placeholders) . ')' + . $tenantFilter; + + $stmt = $this->getPDO()->prepare($sql); + foreach ($binds as $k => $v) { + $stmt->bindValue($k, $v, $this->getPDOType($v)); + } + $stmt->execute(); + $rows = $stmt->fetchAll(); + $stmt->closeCursor(); + + if ($this->sharedTables && $this->tenantPerDocument) { + $existingKeys = []; + foreach ($rows as $row) { + $existingKeys[$row['_tenant'] . ':' . $row['_uid']] = true; + } + $documents = \array_values(\array_filter( + $documents, + fn (Document $doc) => !isset($existingKeys[$doc->getTenant() . ':' . $doc->getId()]) + )); + } else { + $existingUids = \array_flip(\array_column($rows, '_uid')); + $documents = \array_values(\array_filter( + $documents, + fn (Document $doc) => !isset($existingUids[$doc->getId()]) + )); + } + } catch (PDOException $e) { + throw $this->processException($e); + } + } + + if (empty($documents)) { + return []; + } + } + $spatialAttributes = $this->getSpatialAttributes($collection); $collection = $collection->getId(); try { @@ -2573,8 +2649,9 @@ public function createDocuments(Document $collection, array $documents): array $batchKeys = \implode(', ', $batchKeys); $stmt = $this->getPDO()->prepare(" - INSERT INTO {$this->getSQLTable($name)} {$columns} + {$this->getInsertKeyword($ignore)} {$this->getSQLTable($name)} {$columns} VALUES {$batchKeys} + {$this->getInsertSuffix($ignore, $name)} "); foreach ($bindValues as $key => $value) { @@ -2583,13 +2660,111 @@ public function createDocuments(Document $collection, array $documents): array $this->execute($stmt); + // Reconcile returned docs with actual inserts when a race condition skipped rows. + if ($ignore && $stmt->rowCount() < \count($documents)) { + $expectedTimestamps = []; + foreach ($documents as $doc) { + $eKey = ($this->sharedTables && $this->tenantPerDocument) + ? $doc->getTenant() . ':' . $doc->getId() + : $doc->getId(); + $expectedTimestamps[$eKey] = $doc->getCreatedAt(); + } + + $verifyPlaceholders = []; + $verifyBinds = []; + $rawUids = \array_values(\array_unique(\array_map(fn (Document $doc) => $doc->getId(), $documents))); + foreach ($rawUids as $idx => $uid) { + $ph = ':_vfy_' . $idx; + $verifyPlaceholders[] = $ph; + $verifyBinds[$ph] = $uid; + } + + $tenantWhere = ''; + $tenantSelect = ''; + if ($this->sharedTables) { + if ($this->tenantPerDocument) { + $tenants = \array_values(\array_unique(\array_filter( + \array_map(fn (Document $doc) => $doc->getTenant(), $documents), + fn ($v) => $v !== null + ))); + $tenantPlaceholders = []; + foreach ($tenants as $j => $tenant) { + $tKey = ':_vfy_tenant_' . $j; + $tenantPlaceholders[] = $tKey; + $verifyBinds[$tKey] = $tenant; + } + $tenantWhere = ' AND _tenant IN (' . \implode(', ', $tenantPlaceholders) . ')'; + $tenantSelect = ', _tenant'; + } else { + $tenantWhere = ' AND _tenant = :_vfy_tenant'; + $verifyBinds[':_vfy_tenant'] = $this->getTenant(); + } + } + + $verifySql = 'SELECT _uid' . $tenantSelect . ', ' . $this->quote($this->filter('_createdAt')) + . ' FROM ' . $this->getSQLTable($name) + . ' WHERE _uid IN (' . \implode(', ', $verifyPlaceholders) . ')' + . $tenantWhere; + + $verifyStmt = $this->getPDO()->prepare($verifySql); + foreach ($verifyBinds as $k => $v) { + $verifyStmt->bindValue($k, $v, $this->getPDOType($v)); + } + $verifyStmt->execute(); + $rows = $verifyStmt->fetchAll(); + $verifyStmt->closeCursor(); + + // Normalise timestamps — Postgres omits .000 for round seconds. + $normalizeTimestamp = fn (?string $ts): string => $ts !== null + ? (new \DateTime($ts))->format('Y-m-d H:i:s.v') + : ''; + + $actualTimestamps = []; + foreach ($rows as $row) { + $key = ($this->sharedTables && $this->tenantPerDocument) + ? $row['_tenant'] . ':' . $row['_uid'] + : $row['_uid']; + $actualTimestamps[$key] = $normalizeTimestamp($row['_createdAt']); + } + + $insertedDocs = []; + foreach ($documents as $doc) { + $key = ($this->sharedTables && $this->tenantPerDocument) + ? $doc->getTenant() . ':' . $doc->getId() + : $doc->getId(); + if (isset($actualTimestamps[$key]) && $actualTimestamps[$key] === $normalizeTimestamp($expectedTimestamps[$key] ?? null)) { + $insertedDocs[] = $doc; + } + } + $documents = $insertedDocs; + + // Rebuild permissions for actually-inserted docs only + $permissions = []; + $bindValuesPermissions = []; + foreach ($documents as $index => $document) { + foreach (Database::PERMISSIONS as $type) { + foreach ($document->getPermissionsByType($type) as $permission) { + $tenantBind = $this->sharedTables ? ", :_tenant_{$index}" : ''; + $permission = \str_replace('"', '', $permission); + $permission = "('{$type}', '{$permission}', :_uid_{$index} {$tenantBind})"; + $permissions[] = $permission; + $bindValuesPermissions[":_uid_{$index}"] = $document->getId(); + if ($this->sharedTables) { + $bindValuesPermissions[":_tenant_{$index}"] = $document->getTenant(); + } + } + } + } + } + if (!empty($permissions)) { $tenantColumn = $this->sharedTables ? ', _tenant' : ''; $permissions = \implode(', ', $permissions); $sqlPermissions = " - INSERT INTO {$this->getSQLTable($name . '_perms')} (_type, _permission, _document {$tenantColumn}) - VALUES {$permissions}; + {$this->getInsertKeyword($ignore)} {$this->getSQLTable($name . '_perms')} (_type, _permission, _document {$tenantColumn}) + VALUES {$permissions} + {$this->getInsertPermissionsSuffix($ignore)} "; $stmtPermissions = $this->getPDO()->prepare($sqlPermissions); @@ -2608,6 +2783,33 @@ public function createDocuments(Document $collection, array $documents): array return $documents; } + /** + * Returns the INSERT keyword, optionally with IGNORE for duplicate handling. + * Override in adapter subclasses for DB-specific syntax. + */ + protected function getInsertKeyword(bool $ignore): string + { + return $ignore ? 'INSERT IGNORE INTO' : 'INSERT INTO'; + } + + /** + * Returns a suffix appended after VALUES clause for duplicate handling. + * Override in adapter subclasses (e.g., Postgres uses ON CONFLICT DO NOTHING). + */ + protected function getInsertSuffix(bool $ignore, string $table): string + { + return ''; + } + + /** + * Returns a suffix for the permissions INSERT statement when ignoring duplicates. + * Override in adapter subclasses for DB-specific syntax. + */ + protected function getInsertPermissionsSuffix(bool $ignore): string + { + return ''; + } + /** * @param Document $collection * @param string $attribute diff --git a/src/Database/Adapter/SQLite.php b/src/Database/Adapter/SQLite.php index 3c25987eb..8e7ef6b5b 100644 --- a/src/Database/Adapter/SQLite.php +++ b/src/Database/Adapter/SQLite.php @@ -34,6 +34,11 @@ */ class SQLite extends MariaDB { + protected function getInsertKeyword(bool $ignore): string + { + return $ignore ? 'INSERT OR IGNORE INTO' : 'INSERT INTO'; + } + /** * @inheritDoc */ diff --git a/src/Database/Database.php b/src/Database/Database.php index ac58d72f0..cbcc80273 100644 --- a/src/Database/Database.php +++ b/src/Database/Database.php @@ -5621,6 +5621,7 @@ public function createDocument(string $collection, Document $document): Document * @param int $batchSize * @param (callable(Document): void)|null $onNext * @param (callable(Throwable): void)|null $onError + * @param bool $ignore If true, silently ignore duplicate documents instead of throwing * @return int * @throws AuthorizationException * @throws StructureException @@ -5633,6 +5634,7 @@ public function createDocuments( int $batchSize = self::INSERT_BATCH_SIZE, ?callable $onNext = null, ?callable $onError = null, + bool $ignore = false, ): int { if (!$this->adapter->getSharedTables() && $this->adapter->getTenantPerDocument()) { throw new DatabaseException('Shared tables must be enabled if tenant per document is enabled.'); @@ -5653,6 +5655,81 @@ public function createDocuments( $time = DateTime::now(); $modified = 0; + $tenantPerDocument = $this->adapter->getSharedTables() && $this->adapter->getTenantPerDocument(); + + // Deduplicate intra-batch documents by ID (tenant-aware). First occurrence wins. + if ($ignore) { + $seenIds = []; + $deduplicated = []; + foreach ($documents as $document) { + $docId = $document->getId(); + if ($docId !== '') { + $dedupeKey = $tenantPerDocument + ? $document->getTenant() . ':' . $docId + : $docId; + if (isset($seenIds[$dedupeKey])) { + continue; + } + $seenIds[$dedupeKey] = true; + } + $deduplicated[] = $document; + } + $documents = $deduplicated; + } + + // Pre-fetch existing IDs to skip relationship writes for known duplicates + $preExistingIds = []; + if ($ignore) { + if ($tenantPerDocument) { + $idsByTenant = []; + foreach ($documents as $doc) { + $idsByTenant[$doc->getTenant()][] = $doc->getId(); + } + foreach ($idsByTenant as $tenant => $tenantIds) { + $tenantIds = \array_values(\array_unique(\array_filter($tenantIds, fn ($v) => $v !== null))); + foreach (\array_chunk($tenantIds, \max(1, $this->maxQueryValues)) as $idChunk) { + $existing = $this->authorization->skip(fn () => $this->withTenant($tenant, fn () => $this->silent(fn () => $this->find( + $collection->getId(), + [ + Query::equal('$id', $idChunk), + Query::select(['$id']), + Query::limit(\count($idChunk)), + ] + )))); + foreach ($existing as $doc) { + $preExistingIds[$tenant . ':' . $doc->getId()] = true; + } + } + } + } else { + $inputIds = \array_values(\array_unique(\array_filter( + \array_map(fn (Document $doc) => $doc->getId(), $documents), + fn ($v) => $v !== null + ))); + + foreach (\array_chunk($inputIds, \max(1, $this->maxQueryValues)) as $idChunk) { + $existing = $this->authorization->skip(fn () => $this->silent(fn () => $this->find( + $collection->getId(), + [ + Query::equal('$id', $idChunk), + Query::select(['$id']), + Query::limit(\count($idChunk)), + ] + ))); + foreach ($existing as $doc) { + $preExistingIds[$doc->getId()] = true; + } + } + } + } + + /** @var array> $deferredRelationships */ + $deferredRelationships = []; + $relationships = []; + if ($ignore && $this->resolveRelationships) { + $relationships = \array_filter($collection->getAttribute('attributes', []), fn ($attr) => $attr['type'] === self::VAR_RELATIONSHIP); + } + foreach ($documents as $document) { $createdAt = $document->getCreatedAt(); $updatedAt = $document->getUpdatedAt(); @@ -5692,18 +5769,70 @@ public function createDocuments( } } - if ($this->resolveRelationships) { - $document = $this->silent(fn () => $this->createDocumentRelationships($collection, $document)); + if ($this->resolveRelationships && !empty($relationships)) { + // Defer: store relationship data, strip attributes for INSERT. + $relationshipData = []; + foreach ($relationships as $rel) { + $key = $rel['key']; + $value = $document->getAttribute($key); + if ($value !== null) { + $relationshipData[$key] = $value; + } + $document->removeAttribute($key); + } + if (!empty($relationshipData)) { + $deferKey = $tenantPerDocument + ? $document->getTenant() . ':' . $document->getId() + : $document->getId(); + $deferredRelationships[$deferKey] = $relationshipData; + } + } elseif ($this->resolveRelationships) { + $preExistKey = $tenantPerDocument + ? $document->getTenant() . ':' . $document->getId() + : $document->getId(); + + if (!isset($preExistingIds[$preExistKey])) { + $document = $this->silent(fn () => $this->createDocumentRelationships($collection, $document)); + } } $document = $this->adapter->castingBefore($collection, $document); } foreach (\array_chunk($documents, $batchSize) as $chunk) { - $batch = $this->withTransaction(function () use ($collection, $chunk) { - return $this->adapter->createDocuments($collection, $chunk); + if ($ignore && !empty($preExistingIds)) { + $chunk = \array_values(\array_filter($chunk, function (Document $doc) use ($preExistingIds, $tenantPerDocument) { + $key = $tenantPerDocument + ? $doc->getTenant() . ':' . $doc->getId() + : $doc->getId(); + return !isset($preExistingIds[$key]); + })); + if (empty($chunk)) { + continue; + } + } + + $batch = $this->withTransaction(function () use ($collection, $chunk, $ignore) { + return $this->adapter->createDocuments($collection, $chunk, $ignore); }); + // Create deferred relationships only for docs that were actually inserted + if ($ignore && $this->resolveRelationships && \count($deferredRelationships) > 0) { + foreach ($batch as $insertedDoc) { + $deferKey = $tenantPerDocument + ? $insertedDoc->getTenant() . ':' . $insertedDoc->getId() + : $insertedDoc->getId(); + if (\array_key_exists($deferKey, $deferredRelationships)) { + $relDoc = clone $insertedDoc; + foreach ($deferredRelationships[$deferKey] as $key => $value) { + $relDoc->setAttribute($key, $value); + } + $this->silent(fn () => $this->createDocumentRelationships($collection, $relDoc)); + unset($deferredRelationships[$deferKey]); + } + } + } + $batch = $this->adapter->getSequences($collection->getId(), $batch); if (!$this->inBatchRelationshipPopulation && $this->resolveRelationships) { @@ -7116,18 +7245,51 @@ public function upsertDocumentsWithIncrease( $created = 0; $updated = 0; $seenIds = []; - foreach ($documents as $key => $document) { - if ($this->getSharedTables() && $this->getTenantPerDocument()) { - $old = $this->authorization->skip(fn () => $this->withTenant($document->getTenant(), fn () => $this->silent(fn () => $this->getDocument( - $collection->getId(), - $document->getId(), - )))); + + // Batch-fetch existing documents in one query instead of N individual getDocument() calls + $ids = \array_filter(\array_map(fn ($doc) => $doc->getId(), $documents), fn ($v) => $v !== null); + $existingDocs = []; + $upsertTenantPerDocument = $this->getSharedTables() && $this->getTenantPerDocument(); + + if (!empty($ids)) { + $uniqueIds = \array_values(\array_unique($ids)); + + if ($upsertTenantPerDocument) { + $idsByTenant = []; + foreach ($documents as $doc) { + $tenant = $doc->getTenant(); + $idsByTenant[$tenant][] = $doc->getId(); + } + foreach ($idsByTenant as $tenant => $tenantIds) { + $tenantIds = \array_values(\array_unique(\array_filter($tenantIds, fn ($v) => $v !== null))); + foreach (\array_chunk($tenantIds, \max(1, $this->maxQueryValues)) as $idChunk) { + $fetched = $this->authorization->skip(fn () => $this->withTenant($tenant, fn () => $this->silent(fn () => $this->find( + $collection->getId(), + [Query::equal('$id', $idChunk), Query::limit(\count($idChunk))], + )))); + foreach ($fetched as $doc) { + $existingDocs[$tenant . ':' . $doc->getId()] = $doc; + } + } + } } else { - $old = $this->authorization->skip(fn () => $this->silent(fn () => $this->getDocument( - $collection->getId(), - $document->getId(), - ))); + foreach (\array_chunk($uniqueIds, \max(1, $this->maxQueryValues)) as $idChunk) { + $fetched = $this->authorization->skip(fn () => $this->silent(fn () => $this->find( + $collection->getId(), + [Query::equal('$id', $idChunk), Query::limit(\count($idChunk))], + ))); + foreach ($fetched as $doc) { + $existingDocs[$doc->getId()] = $doc; + } + } } + } + + foreach ($documents as $key => $document) { + $lookupKey = $upsertTenantPerDocument + ? $document->getTenant() . ':' . $document->getId() + : $document->getId(); + $old = $existingDocs[$lookupKey] ?? new Document(); // Extract operators early to avoid comparison issues $documentArray = $document->getArrayCopy(); @@ -7294,7 +7456,9 @@ public function upsertDocumentsWithIncrease( $document = $this->silent(fn () => $this->createDocumentRelationships($collection, $document)); } - $seenIds[] = $document->getId(); + $seenIds[] = $upsertTenantPerDocument + ? $document->getTenant() . ':' . $document->getId() + : $document->getId(); $old = $this->adapter->castingBefore($collection, $old); $document = $this->adapter->castingBefore($collection, $document); diff --git a/src/Database/Mirror.php b/src/Database/Mirror.php index f740cab3e..636d273dd 100644 --- a/src/Database/Mirror.php +++ b/src/Database/Mirror.php @@ -600,6 +600,7 @@ public function createDocuments( int $batchSize = self::INSERT_BATCH_SIZE, ?callable $onNext = null, ?callable $onError = null, + bool $ignore = false, ): int { $modified = $this->source->createDocuments( $collection, @@ -607,6 +608,7 @@ public function createDocuments( $batchSize, $onNext, $onError, + $ignore, ); if ( @@ -645,6 +647,7 @@ public function createDocuments( $collection, $clones, $batchSize, + ignore: $ignore, ) ); From bee42d477cb6aa602ce0df88bd9b77a93874fed0 Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Thu, 9 Apr 2026 11:27:17 +0100 Subject: [PATCH 02/25] Add e2e tests for createDocuments ignore mode - testCreateDocumentsIgnoreDuplicates: mixed batch with onNext assertions - testCreateDocumentsIgnoreIntraBatchDuplicates: first-wins, no ACL drift - testCreateDocumentsIgnoreAllDuplicates: zero inserts, empty onNext --- tests/e2e/Adapter/Scopes/DocumentTests.php | 196 +++++++++++++++++++++ 1 file changed, 196 insertions(+) diff --git a/tests/e2e/Adapter/Scopes/DocumentTests.php b/tests/e2e/Adapter/Scopes/DocumentTests.php index d16004d32..34a463200 100644 --- a/tests/e2e/Adapter/Scopes/DocumentTests.php +++ b/tests/e2e/Adapter/Scopes/DocumentTests.php @@ -7722,4 +7722,200 @@ public function testRegexInjection(): void // } // $database->deleteCollection($collectionName); // } + + public function testCreateDocumentsIgnoreDuplicates(): void + { + /** @var Database $database */ + $database = $this->getDatabase(); + + $database->createCollection(__FUNCTION__); + $database->createAttribute(__FUNCTION__, 'name', Database::VAR_STRING, 128, true); + + // Insert initial documents + $database->createDocuments(__FUNCTION__, [ + new Document([ + '$id' => 'doc1', + 'name' => 'Original A', + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]), + new Document([ + '$id' => 'doc2', + 'name' => 'Original B', + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]), + ]); + + // Without ignore, duplicates should throw + try { + $database->createDocuments(__FUNCTION__, [ + new Document([ + '$id' => 'doc1', + 'name' => 'Duplicate A', + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]), + ]); + $this->fail('Expected DuplicateException'); + } catch (DuplicateException $e) { + $this->assertNotEmpty($e->getMessage()); + } + + // With ignore, duplicates should be silently skipped + $emittedIds = []; + $count = $database->createDocuments(__FUNCTION__, [ + new Document([ + '$id' => 'doc1', + 'name' => 'Duplicate A', + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]), + new Document([ + '$id' => 'doc3', + 'name' => 'New C', + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]), + ], onNext: function (Document $doc) use (&$emittedIds) { + $emittedIds[] = $doc->getId(); + }, ignore: true); + + // Only doc3 was new, doc1 was skipped as duplicate + $this->assertSame(1, $count); + $this->assertCount(1, $emittedIds); + $this->assertSame('doc3', $emittedIds[0]); + + // doc3 should exist, doc1 should retain original value + $doc1 = $database->getDocument(__FUNCTION__, 'doc1'); + $this->assertSame('Original A', $doc1->getAttribute('name')); + + $doc3 = $database->getDocument(__FUNCTION__, 'doc3'); + $this->assertSame('New C', $doc3->getAttribute('name')); + + // Total should be 3 (doc1, doc2, doc3) + $all = $database->find(__FUNCTION__); + $this->assertCount(3, $all); + } + + public function testCreateDocumentsIgnoreIntraBatchDuplicates(): void + { + /** @var Database $database */ + $database = $this->getDatabase(); + $col = 'createDocsIgnoreIntraBatch'; + + $database->createCollection($col); + $database->createAttribute($col, 'name', Database::VAR_STRING, 128, true); + + // Two docs with same ID in one batch — first wins, second is deduplicated + $emittedIds = []; + $count = $database->createDocuments($col, [ + new Document([ + '$id' => 'dup', + 'name' => 'First', + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]), + new Document([ + '$id' => 'dup', + 'name' => 'Second', + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + Permission::update(Role::user('extra')), + ], + ]), + new Document([ + '$id' => 'unique1', + 'name' => 'Unique', + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]), + ], onNext: function (Document $doc) use (&$emittedIds) { + $emittedIds[] = $doc->getId(); + }, ignore: true); + + $this->assertSame(2, $count); + $this->assertCount(2, $emittedIds); + + // First occurrence wins + $doc = $database->getDocument($col, 'dup'); + $this->assertSame('First', $doc->getAttribute('name')); + + // Second doc's extra permission should NOT exist (no ACL drift) + $perms = $doc->getPermissions(); + foreach ($perms as $perm) { + $this->assertStringNotContainsString('extra', $perm); + } + + // unique1 should exist + $unique = $database->getDocument($col, 'unique1'); + $this->assertSame('Unique', $unique->getAttribute('name')); + + // Total: 2 documents + $all = $database->find($col); + $this->assertCount(2, $all); + } + + public function testCreateDocumentsIgnoreAllDuplicates(): void + { + /** @var Database $database */ + $database = $this->getDatabase(); + + $database->createCollection(__FUNCTION__); + $database->createAttribute(__FUNCTION__, 'name', Database::VAR_STRING, 128, true); + + // Insert initial document + $database->createDocuments(__FUNCTION__, [ + new Document([ + '$id' => 'existing', + 'name' => 'Original', + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]), + ]); + + // With ignore, inserting only duplicates should succeed with no new rows + $emittedIds = []; + $count = $database->createDocuments(__FUNCTION__, [ + new Document([ + '$id' => 'existing', + 'name' => 'Duplicate', + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]), + ], onNext: function (Document $doc) use (&$emittedIds) { + $emittedIds[] = $doc->getId(); + }, ignore: true); + + // All duplicates skipped, nothing inserted + $this->assertSame(0, $count); + $this->assertSame([], $emittedIds); + + // Original document should be unchanged + $doc = $database->getDocument(__FUNCTION__, 'existing'); + $this->assertSame('Original', $doc->getAttribute('name')); + + // Still only 1 document + $all = $database->find(__FUNCTION__); + $this->assertCount(1, $all); + } } From 93a9136fa02e1b22b3108b9f2db9d07a936e3115 Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Fri, 10 Apr 2026 07:50:16 +0100 Subject: [PATCH 03/25] Fix Mongo adapter ignore mode: pass ignoreDuplicates to client and fix _uid mapping - Pass ignoreDuplicates option to insertMany for race-condition safety - Extract _uid from raw array before replaceChars (excluded from transformation) - Map inserted records back to original Document objects by _uid --- src/Database/Adapter/Mongo.php | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/Database/Adapter/Mongo.php b/src/Database/Adapter/Mongo.php index 71bbc3454..d82e8f8cb 100644 --- a/src/Database/Adapter/Mongo.php +++ b/src/Database/Adapter/Mongo.php @@ -1576,18 +1576,37 @@ public function createDocuments(Document $collection, array $documents, bool $ig } } + if ($ignore) { + $options['ignoreDuplicates'] = true; + } + try { - $documents = $this->client->insertMany($name, $records, $options); + $inserted = $this->client->insertMany($name, $records, $options); } catch (MongoException $e) { throw $this->processException($e); } - foreach ($documents as $index => $document) { - $documents[$index] = $this->replaceChars('_', '$', $this->client->toArray($document)); - $documents[$index] = new Document($documents[$index]); + $insertedUids = []; + foreach ($inserted as $record) { + $arr = $this->client->toArray($record); + // _uid is excluded from replaceChars transformation, so extract it before conversion + $uid = $arr['_uid'] ?? ''; + $arr = $this->replaceChars('_', '$', $arr); + $insertedUids[$uid] = new Document($arr); } - return $documents; + if ($ignore) { + $result = []; + foreach ($records as $i => $record) { + $uid = $record['_uid'] ?? ''; + if (isset($insertedUids[$uid])) { + $result[] = $documents[$i]; + } + } + return $result; + } + + return \array_values($insertedUids); } /** From 2906ddafadf4f734b4bd6fdbccbf262c9f1ac10d Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Fri, 10 Apr 2026 08:00:41 +0100 Subject: [PATCH 04/25] Revert "Fix Mongo adapter ignore mode: pass ignoreDuplicates to client and fix _uid mapping" This reverts commit 93a9136fa02e1b22b3108b9f2db9d07a936e3115. --- src/Database/Adapter/Mongo.php | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/src/Database/Adapter/Mongo.php b/src/Database/Adapter/Mongo.php index d82e8f8cb..71bbc3454 100644 --- a/src/Database/Adapter/Mongo.php +++ b/src/Database/Adapter/Mongo.php @@ -1576,37 +1576,18 @@ public function createDocuments(Document $collection, array $documents, bool $ig } } - if ($ignore) { - $options['ignoreDuplicates'] = true; - } - try { - $inserted = $this->client->insertMany($name, $records, $options); + $documents = $this->client->insertMany($name, $records, $options); } catch (MongoException $e) { throw $this->processException($e); } - $insertedUids = []; - foreach ($inserted as $record) { - $arr = $this->client->toArray($record); - // _uid is excluded from replaceChars transformation, so extract it before conversion - $uid = $arr['_uid'] ?? ''; - $arr = $this->replaceChars('_', '$', $arr); - $insertedUids[$uid] = new Document($arr); - } - - if ($ignore) { - $result = []; - foreach ($records as $i => $record) { - $uid = $record['_uid'] ?? ''; - if (isset($insertedUids[$uid])) { - $result[] = $documents[$i]; - } - } - return $result; + foreach ($documents as $index => $document) { + $documents[$index] = $this->replaceChars('_', '$', $this->client->toArray($document)); + $documents[$index] = new Document($documents[$index]); } - return \array_values($insertedUids); + return $documents; } /** From 63d9902530f272d6863960dfbc17089519deebb4 Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Fri, 10 Apr 2026 13:45:47 +0100 Subject: [PATCH 05/25] Replace ignore param with skipDuplicates scope guard - Add skipDuplicates(callable) scope guard on Database, following existing pattern (skipRelationships, skipValidation, etc.) - Remove bool $ignore parameter from createDocuments signature - Mirror propagates skipDuplicates state to source and destination - Update tests to use $database->skipDuplicates(function() { ... }) --- src/Database/Database.php | 17 ++- src/Database/Mirror.php | 15 ++- tests/e2e/Adapter/Scopes/DocumentTests.php | 134 +++++++++++---------- 3 files changed, 96 insertions(+), 70 deletions(-) diff --git a/src/Database/Database.php b/src/Database/Database.php index cbcc80273..78a9c5a57 100644 --- a/src/Database/Database.php +++ b/src/Database/Database.php @@ -417,6 +417,8 @@ class Database protected bool $preserveDates = false; + protected bool $skipDuplicates = false; + protected bool $preserveSequence = false; protected int $maxQueryValues = 5000; @@ -842,6 +844,18 @@ public function skipRelationshipsExistCheck(callable $callback): mixed } } + public function skipDuplicates(callable $callback): mixed + { + $previous = $this->skipDuplicates; + $this->skipDuplicates = true; + + try { + return $callback(); + } finally { + $this->skipDuplicates = $previous; + } + } + /** * Trigger callback for events * @@ -5621,7 +5635,6 @@ public function createDocument(string $collection, Document $document): Document * @param int $batchSize * @param (callable(Document): void)|null $onNext * @param (callable(Throwable): void)|null $onError - * @param bool $ignore If true, silently ignore duplicate documents instead of throwing * @return int * @throws AuthorizationException * @throws StructureException @@ -5634,7 +5647,6 @@ public function createDocuments( int $batchSize = self::INSERT_BATCH_SIZE, ?callable $onNext = null, ?callable $onError = null, - bool $ignore = false, ): int { if (!$this->adapter->getSharedTables() && $this->adapter->getTenantPerDocument()) { throw new DatabaseException('Shared tables must be enabled if tenant per document is enabled.'); @@ -5656,6 +5668,7 @@ public function createDocuments( $modified = 0; $tenantPerDocument = $this->adapter->getSharedTables() && $this->adapter->getTenantPerDocument(); + $ignore = $this->skipDuplicates; // Deduplicate intra-batch documents by ID (tenant-aware). First occurrence wins. if ($ignore) { diff --git a/src/Database/Mirror.php b/src/Database/Mirror.php index 636d273dd..5f3ae640c 100644 --- a/src/Database/Mirror.php +++ b/src/Database/Mirror.php @@ -600,17 +600,19 @@ public function createDocuments( int $batchSize = self::INSERT_BATCH_SIZE, ?callable $onNext = null, ?callable $onError = null, - bool $ignore = false, ): int { - $modified = $this->source->createDocuments( + $createFn = fn () => $this->source->createDocuments( $collection, $documents, $batchSize, $onNext, $onError, - $ignore, ); + $modified = $this->skipDuplicates + ? $this->source->skipDuplicates($createFn) + : $createFn(); + if ( \in_array($collection, self::SOURCE_ONLY_COLLECTIONS) || $this->destination === null @@ -641,16 +643,19 @@ public function createDocuments( $clones[] = $clone; } - $this->destination->withPreserveDates( + $destFn = fn () => $this->destination->withPreserveDates( fn () => $this->destination->createDocuments( $collection, $clones, $batchSize, - ignore: $ignore, ) ); + $this->skipDuplicates + ? $this->destination->skipDuplicates($destFn) + : $destFn(); + foreach ($clones as $clone) { foreach ($this->writeFilters as $filter) { $filter->afterCreateDocument( diff --git a/tests/e2e/Adapter/Scopes/DocumentTests.php b/tests/e2e/Adapter/Scopes/DocumentTests.php index 34a463200..5f82cc025 100644 --- a/tests/e2e/Adapter/Scopes/DocumentTests.php +++ b/tests/e2e/Adapter/Scopes/DocumentTests.php @@ -7768,28 +7768,31 @@ public function testCreateDocumentsIgnoreDuplicates(): void $this->assertNotEmpty($e->getMessage()); } - // With ignore, duplicates should be silently skipped + // With skipDuplicates, duplicates should be silently skipped $emittedIds = []; - $count = $database->createDocuments(__FUNCTION__, [ - new Document([ - '$id' => 'doc1', - 'name' => 'Duplicate A', - '$permissions' => [ - Permission::read(Role::any()), - Permission::create(Role::any()), - ], - ]), - new Document([ - '$id' => 'doc3', - 'name' => 'New C', - '$permissions' => [ - Permission::read(Role::any()), - Permission::create(Role::any()), - ], - ]), - ], onNext: function (Document $doc) use (&$emittedIds) { - $emittedIds[] = $doc->getId(); - }, ignore: true); + $collection = __FUNCTION__; + $count = $database->skipDuplicates(function () use ($database, $collection, &$emittedIds) { + return $database->createDocuments($collection, [ + new Document([ + '$id' => 'doc1', + 'name' => 'Duplicate A', + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]), + new Document([ + '$id' => 'doc3', + 'name' => 'New C', + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]), + ], onNext: function (Document $doc) use (&$emittedIds) { + $emittedIds[] = $doc->getId(); + }); + }); // Only doc3 was new, doc1 was skipped as duplicate $this->assertSame(1, $count); @@ -7819,35 +7822,37 @@ public function testCreateDocumentsIgnoreIntraBatchDuplicates(): void // Two docs with same ID in one batch — first wins, second is deduplicated $emittedIds = []; - $count = $database->createDocuments($col, [ - new Document([ - '$id' => 'dup', - 'name' => 'First', - '$permissions' => [ - Permission::read(Role::any()), - Permission::create(Role::any()), - ], - ]), - new Document([ - '$id' => 'dup', - 'name' => 'Second', - '$permissions' => [ - Permission::read(Role::any()), - Permission::create(Role::any()), - Permission::update(Role::user('extra')), - ], - ]), - new Document([ - '$id' => 'unique1', - 'name' => 'Unique', - '$permissions' => [ - Permission::read(Role::any()), - Permission::create(Role::any()), - ], - ]), - ], onNext: function (Document $doc) use (&$emittedIds) { - $emittedIds[] = $doc->getId(); - }, ignore: true); + $count = $database->skipDuplicates(function () use ($database, $col, &$emittedIds) { + return $database->createDocuments($col, [ + new Document([ + '$id' => 'dup', + 'name' => 'First', + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]), + new Document([ + '$id' => 'dup', + 'name' => 'Second', + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + Permission::update(Role::user('extra')), + ], + ]), + new Document([ + '$id' => 'unique1', + 'name' => 'Unique', + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]), + ], onNext: function (Document $doc) use (&$emittedIds) { + $emittedIds[] = $doc->getId(); + }); + }); $this->assertSame(2, $count); $this->assertCount(2, $emittedIds); @@ -7891,20 +7896,23 @@ public function testCreateDocumentsIgnoreAllDuplicates(): void ]), ]); - // With ignore, inserting only duplicates should succeed with no new rows + // With skipDuplicates, inserting only duplicates should succeed with no new rows $emittedIds = []; - $count = $database->createDocuments(__FUNCTION__, [ - new Document([ - '$id' => 'existing', - 'name' => 'Duplicate', - '$permissions' => [ - Permission::read(Role::any()), - Permission::create(Role::any()), - ], - ]), - ], onNext: function (Document $doc) use (&$emittedIds) { - $emittedIds[] = $doc->getId(); - }, ignore: true); + $collection = __FUNCTION__; + $count = $database->skipDuplicates(function () use ($database, $collection, &$emittedIds) { + return $database->createDocuments($collection, [ + new Document([ + '$id' => 'existing', + 'name' => 'Duplicate', + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]), + ], onNext: function (Document $doc) use (&$emittedIds) { + $emittedIds[] = $doc->getId(); + }); + }); // All duplicates skipped, nothing inserted $this->assertSame(0, $count); From b0a8392d2c8e90c5ca0b49ed24758a36b13f7a0c Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Sun, 12 Apr 2026 11:43:46 +0100 Subject: [PATCH 06/25] Push skipDuplicates scope guard down to Adapter layer - Add $skipDuplicates property + skipDuplicates() scope guard to Adapter - Remove bool $ignore parameter from all adapter createDocuments signatures (Adapter, Pool, Mongo, SQL) - Remove bool $ignore from SQL helper methods (getInsertKeyword, getInsertSuffix, getInsertPermissionsSuffix) and Postgres/SQLite overrides - Pool delegate() and withTransaction() propagate skipDuplicates state to pooled/pinned adapter via the scope guard - Database::createDocuments() wraps adapter call in adapter->skipDuplicates() when the flag is set, drops the local $ignore variable --- src/Database/Adapter.php | 26 ++++++++++++++++++++++++-- src/Database/Adapter/Mongo.php | 4 ++-- src/Database/Adapter/Pool.php | 11 +++++++---- src/Database/Adapter/Postgres.php | 10 +++++----- src/Database/Adapter/SQL.php | 22 +++++++++++----------- src/Database/Adapter/SQLite.php | 4 ++-- src/Database/Database.php | 18 ++++++++++-------- 7 files changed, 61 insertions(+), 34 deletions(-) diff --git a/src/Database/Adapter.php b/src/Database/Adapter.php index 9cc83d141..26fe8249c 100644 --- a/src/Database/Adapter.php +++ b/src/Database/Adapter.php @@ -33,6 +33,29 @@ abstract class Adapter protected bool $alterLocks = false; + protected bool $skipDuplicates = false; + + /** + * Run a callback with skipDuplicates enabled. + * Duplicate key errors during createDocuments() will be silently skipped + * instead of thrown. Nestable — saves and restores previous state. + * + * @template T + * @param callable(): T $callback + * @return T + */ + public function skipDuplicates(callable $callback): mixed + { + $previous = $this->skipDuplicates; + $this->skipDuplicates = true; + + try { + return $callback(); + } finally { + $this->skipDuplicates = $previous; + } + } + /** * @var array */ @@ -729,13 +752,12 @@ abstract public function createDocument(Document $collection, Document $document * * @param Document $collection * @param array $documents - * @param bool $ignore If true, silently ignore duplicate documents instead of throwing * * @return array * * @throws DatabaseException */ - abstract public function createDocuments(Document $collection, array $documents, bool $ignore = false): array; + abstract public function createDocuments(Document $collection, array $documents): array; /** * Update Document diff --git a/src/Database/Adapter/Mongo.php b/src/Database/Adapter/Mongo.php index 71bbc3454..c788aebba 100644 --- a/src/Database/Adapter/Mongo.php +++ b/src/Database/Adapter/Mongo.php @@ -1460,7 +1460,7 @@ public function castingBefore(Document $collection, Document $document): Documen * @throws DuplicateException * @throws DatabaseException */ - public function createDocuments(Document $collection, array $documents, bool $ignore = false): array + public function createDocuments(Document $collection, array $documents): array { $name = $this->getNamespace() . '_' . $this->filter($collection->getId()); $options = $this->getTransactionOptions(); @@ -1488,7 +1488,7 @@ public function createDocuments(Document $collection, array $documents, bool $ig } // Pre-filter duplicates within the session to avoid aborting the transaction. - if ($ignore && !empty($records)) { + if ($this->skipDuplicates && !empty($records)) { $existingKeys = []; try { diff --git a/src/Database/Adapter/Pool.php b/src/Database/Adapter/Pool.php index ec508c3e4..10ae1de90 100644 --- a/src/Database/Adapter/Pool.php +++ b/src/Database/Adapter/Pool.php @@ -43,7 +43,8 @@ public function __construct(UtopiaPool $pool) public function delegate(string $method, array $args): mixed { if ($this->pinnedAdapter !== null) { - return $this->pinnedAdapter->{$method}(...$args); + $invoke = fn () => $this->pinnedAdapter->{$method}(...$args); + return $this->skipDuplicates ? $this->pinnedAdapter->skipDuplicates($invoke) : $invoke(); } return $this->pool->use(function (Adapter $adapter) use ($method, $args) { @@ -66,7 +67,8 @@ public function delegate(string $method, array $args): mixed $adapter->setMetadata($key, $value); } - return $adapter->{$method}(...$args); + $invoke = fn () => $adapter->{$method}(...$args); + return $this->skipDuplicates ? $adapter->skipDuplicates($invoke) : $invoke(); }); } @@ -146,7 +148,8 @@ public function withTransaction(callable $callback): mixed $this->pinnedAdapter = $adapter; try { - return $adapter->withTransaction($callback); + $invoke = fn () => $adapter->withTransaction($callback); + return $this->skipDuplicates ? $adapter->skipDuplicates($invoke) : $invoke(); } finally { $this->pinnedAdapter = null; } @@ -268,7 +271,7 @@ public function createDocument(Document $collection, Document $document): Docume return $this->delegate(__FUNCTION__, \func_get_args()); } - public function createDocuments(Document $collection, array $documents, bool $ignore = false): array + public function createDocuments(Document $collection, array $documents): array { return $this->delegate(__FUNCTION__, \func_get_args()); } diff --git a/src/Database/Adapter/Postgres.php b/src/Database/Adapter/Postgres.php index 814ecc8bc..03036dd2f 100644 --- a/src/Database/Adapter/Postgres.php +++ b/src/Database/Adapter/Postgres.php @@ -1365,14 +1365,14 @@ public function updateDocument(Document $collection, string $id, Document $docum return $document; } - protected function getInsertKeyword(bool $ignore): string + protected function getInsertKeyword(): string { return 'INSERT INTO'; } - protected function getInsertSuffix(bool $ignore, string $table): string + protected function getInsertSuffix(string $table): string { - if (!$ignore) { + if (!$this->skipDuplicates) { return ''; } @@ -1381,9 +1381,9 @@ protected function getInsertSuffix(bool $ignore, string $table): string return "ON CONFLICT {$conflictTarget} DO NOTHING"; } - protected function getInsertPermissionsSuffix(bool $ignore): string + protected function getInsertPermissionsSuffix(): string { - if (!$ignore) { + if (!$this->skipDuplicates) { return ''; } diff --git a/src/Database/Adapter/SQL.php b/src/Database/Adapter/SQL.php index 6c918ef15..4fb58ae3f 100644 --- a/src/Database/Adapter/SQL.php +++ b/src/Database/Adapter/SQL.php @@ -2471,14 +2471,14 @@ protected function execute(mixed $stmt): bool * @throws DuplicateException * @throws \Throwable */ - public function createDocuments(Document $collection, array $documents, bool $ignore = false): array + public function createDocuments(Document $collection, array $documents): array { if (empty($documents)) { return $documents; } // Pre-filter existing UIDs to prevent race-condition duplicates. - if ($ignore) { + if ($this->skipDuplicates) { $collectionId = $collection->getId(); $name = $this->filter($collectionId); $uids = \array_filter(\array_map(fn (Document $doc) => $doc->getId(), $documents), fn ($v) => $v !== null); @@ -2649,9 +2649,9 @@ public function createDocuments(Document $collection, array $documents, bool $ig $batchKeys = \implode(', ', $batchKeys); $stmt = $this->getPDO()->prepare(" - {$this->getInsertKeyword($ignore)} {$this->getSQLTable($name)} {$columns} + {$this->getInsertKeyword()} {$this->getSQLTable($name)} {$columns} VALUES {$batchKeys} - {$this->getInsertSuffix($ignore, $name)} + {$this->getInsertSuffix($name)} "); foreach ($bindValues as $key => $value) { @@ -2661,7 +2661,7 @@ public function createDocuments(Document $collection, array $documents, bool $ig $this->execute($stmt); // Reconcile returned docs with actual inserts when a race condition skipped rows. - if ($ignore && $stmt->rowCount() < \count($documents)) { + if ($this->skipDuplicates && $stmt->rowCount() < \count($documents)) { $expectedTimestamps = []; foreach ($documents as $doc) { $eKey = ($this->sharedTables && $this->tenantPerDocument) @@ -2762,9 +2762,9 @@ public function createDocuments(Document $collection, array $documents, bool $ig $permissions = \implode(', ', $permissions); $sqlPermissions = " - {$this->getInsertKeyword($ignore)} {$this->getSQLTable($name . '_perms')} (_type, _permission, _document {$tenantColumn}) + {$this->getInsertKeyword()} {$this->getSQLTable($name . '_perms')} (_type, _permission, _document {$tenantColumn}) VALUES {$permissions} - {$this->getInsertPermissionsSuffix($ignore)} + {$this->getInsertPermissionsSuffix()} "; $stmtPermissions = $this->getPDO()->prepare($sqlPermissions); @@ -2787,16 +2787,16 @@ public function createDocuments(Document $collection, array $documents, bool $ig * Returns the INSERT keyword, optionally with IGNORE for duplicate handling. * Override in adapter subclasses for DB-specific syntax. */ - protected function getInsertKeyword(bool $ignore): string + protected function getInsertKeyword(): string { - return $ignore ? 'INSERT IGNORE INTO' : 'INSERT INTO'; + return $this->skipDuplicates ? 'INSERT IGNORE INTO' : 'INSERT INTO'; } /** * Returns a suffix appended after VALUES clause for duplicate handling. * Override in adapter subclasses (e.g., Postgres uses ON CONFLICT DO NOTHING). */ - protected function getInsertSuffix(bool $ignore, string $table): string + protected function getInsertSuffix(string $table): string { return ''; } @@ -2805,7 +2805,7 @@ protected function getInsertSuffix(bool $ignore, string $table): string * Returns a suffix for the permissions INSERT statement when ignoring duplicates. * Override in adapter subclasses for DB-specific syntax. */ - protected function getInsertPermissionsSuffix(bool $ignore): string + protected function getInsertPermissionsSuffix(): string { return ''; } diff --git a/src/Database/Adapter/SQLite.php b/src/Database/Adapter/SQLite.php index 8e7ef6b5b..2732f784b 100644 --- a/src/Database/Adapter/SQLite.php +++ b/src/Database/Adapter/SQLite.php @@ -34,9 +34,9 @@ */ class SQLite extends MariaDB { - protected function getInsertKeyword(bool $ignore): string + protected function getInsertKeyword(): string { - return $ignore ? 'INSERT OR IGNORE INTO' : 'INSERT INTO'; + return $this->skipDuplicates ? 'INSERT OR IGNORE INTO' : 'INSERT INTO'; } /** diff --git a/src/Database/Database.php b/src/Database/Database.php index 78a9c5a57..146ceacaf 100644 --- a/src/Database/Database.php +++ b/src/Database/Database.php @@ -5668,10 +5668,9 @@ public function createDocuments( $modified = 0; $tenantPerDocument = $this->adapter->getSharedTables() && $this->adapter->getTenantPerDocument(); - $ignore = $this->skipDuplicates; // Deduplicate intra-batch documents by ID (tenant-aware). First occurrence wins. - if ($ignore) { + if ($this->skipDuplicates) { $seenIds = []; $deduplicated = []; foreach ($documents as $document) { @@ -5692,7 +5691,7 @@ public function createDocuments( // Pre-fetch existing IDs to skip relationship writes for known duplicates $preExistingIds = []; - if ($ignore) { + if ($this->skipDuplicates) { if ($tenantPerDocument) { $idsByTenant = []; foreach ($documents as $doc) { @@ -5739,7 +5738,7 @@ public function createDocuments( /** @var array> $deferredRelationships */ $deferredRelationships = []; $relationships = []; - if ($ignore && $this->resolveRelationships) { + if ($this->skipDuplicates && $this->resolveRelationships) { $relationships = \array_filter($collection->getAttribute('attributes', []), fn ($attr) => $attr['type'] === self::VAR_RELATIONSHIP); } @@ -5813,7 +5812,7 @@ public function createDocuments( } foreach (\array_chunk($documents, $batchSize) as $chunk) { - if ($ignore && !empty($preExistingIds)) { + if ($this->skipDuplicates && !empty($preExistingIds)) { $chunk = \array_values(\array_filter($chunk, function (Document $doc) use ($preExistingIds, $tenantPerDocument) { $key = $tenantPerDocument ? $doc->getTenant() . ':' . $doc->getId() @@ -5825,12 +5824,15 @@ public function createDocuments( } } - $batch = $this->withTransaction(function () use ($collection, $chunk, $ignore) { - return $this->adapter->createDocuments($collection, $chunk, $ignore); + $batch = $this->withTransaction(function () use ($collection, $chunk) { + $createFn = fn () => $this->adapter->createDocuments($collection, $chunk); + return $this->skipDuplicates + ? $this->adapter->skipDuplicates($createFn) + : $createFn(); }); // Create deferred relationships only for docs that were actually inserted - if ($ignore && $this->resolveRelationships && \count($deferredRelationships) > 0) { + if ($this->skipDuplicates && $this->resolveRelationships && \count($deferredRelationships) > 0) { foreach ($batch as $insertedDoc) { $deferKey = $tenantPerDocument ? $insertedDoc->getTenant() . ':' . $insertedDoc->getId() From d8f647c33c3bd3073c210f4c9055319c2efb70e4 Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Sun, 12 Apr 2026 14:32:58 +0100 Subject: [PATCH 07/25] Refactor: extract helpers and collapse conditional wraps - skipDuplicates() takes optional $enable so Pool/Mirror/Database can forward state in a single call instead of branching - Database: tenantKey(Document) helper replaces ~8 inline ternaries - Database: fetchExistingByIds() helper replaces the tenant-grouped batched find() in createDocuments and upsertDocuments pre-fetch - Database: drop dead guard on non-defer relationship path - Mongo: findExistingUids() helper replaces the duplicated cursor walk in the skipDuplicates pre-filter (tenant and non-tenant variants) - SQL: buildUidTenantLookup() helper replaces the duplicated placeholder-and-binds builder shared by pre-filter and reconciliation --- src/Database/Adapter.php | 9 +- src/Database/Adapter/Mongo.php | 97 ++++++++------ src/Database/Adapter/Pool.php | 18 ++- src/Database/Adapter/SQL.php | 136 ++++++++++---------- src/Database/Database.php | 225 +++++++++++++++------------------ src/Database/Mirror.php | 39 +++--- 6 files changed, 259 insertions(+), 265 deletions(-) diff --git a/src/Database/Adapter.php b/src/Database/Adapter.php index 26fe8249c..757d0c464 100644 --- a/src/Database/Adapter.php +++ b/src/Database/Adapter.php @@ -39,13 +39,20 @@ abstract class Adapter * Run a callback with skipDuplicates enabled. * Duplicate key errors during createDocuments() will be silently skipped * instead of thrown. Nestable — saves and restores previous state. + * Pass $enable = false to run the callback without toggling the flag + * (useful for conditional forwarding from wrappers like Pool/Mirror). * * @template T * @param callable(): T $callback + * @param bool $enable * @return T */ - public function skipDuplicates(callable $callback): mixed + public function skipDuplicates(callable $callback, bool $enable = true): mixed { + if (!$enable) { + return $callback(); + } + $previous = $this->skipDuplicates; $this->skipDuplicates = true; diff --git a/src/Database/Adapter/Mongo.php b/src/Database/Adapter/Mongo.php index c788aebba..3f6bbc354 100644 --- a/src/Database/Adapter/Mongo.php +++ b/src/Database/Adapter/Mongo.php @@ -1449,6 +1449,42 @@ public function castingBefore(Document $collection, Document $document): Documen return $document; } + /** + * Find existing `_uid` values matching the given filters, fully walking the cursor. + * + * @param string $name + * @param array $filters + * @param int $batchSize Hint: expected max result count — lets MongoDB return everything in one batch + * @return array + * + * @throws MongoException + */ + private function findExistingUids(string $name, array $filters, int $batchSize): array + { + $options = $this->getTransactionOptions([ + 'projection' => ['_uid' => 1], + 'batchSize' => $batchSize, + ]); + + $response = $this->client->find($name, $filters, $options); + $uids = []; + + foreach ($response->cursor->firstBatch ?? [] as $doc) { + $uids[] = $doc->_uid; + } + + $cursorId = $response->cursor->id ?? 0; + while ($cursorId != 0) { + $more = $this->client->getMore($cursorId, $name, $batchSize); + foreach ($more->cursor->nextBatch ?? [] as $doc) { + $uids[] = $doc->_uid; + } + $cursorId = $more->cursor->id ?? 0; + } + + return $uids; + } + /** * Create Documents in batches * @@ -1463,8 +1499,8 @@ public function castingBefore(Document $collection, Document $document): Documen public function createDocuments(Document $collection, array $documents): array { $name = $this->getNamespace() . '_' . $this->filter($collection->getId()); - $options = $this->getTransactionOptions(); + $options = $this->getTransactionOptions(); $records = []; $hasSequence = null; $documents = \array_map(fn ($doc) => clone $doc, $documents); @@ -1487,12 +1523,18 @@ public function createDocuments(Document $collection, array $documents): array $records[] = $record; } - // Pre-filter duplicates within the session to avoid aborting the transaction. + // Pre-filter duplicates within the session — MongoDB has no INSERT IGNORE, + // so sending a duplicate would abort the entire transaction. if ($this->skipDuplicates && !empty($records)) { + $tenantPerDoc = $this->sharedTables && $this->tenantPerDocument; + $recordKey = fn (array $record): string => $tenantPerDoc + ? ($record['_tenant'] ?? $this->getTenant()) . ':' . ($record['_uid'] ?? '') + : ($record['_uid'] ?? ''); + $existingKeys = []; try { - if ($this->sharedTables && $this->tenantPerDocument) { + if ($tenantPerDoc) { $idsByTenant = []; foreach ($records as $record) { $uid = $record['_uid'] ?? ''; @@ -1505,47 +1547,23 @@ public function createDocuments(Document $collection, array $documents): array foreach ($idsByTenant as $tenant => $tenantUids) { $tenantUids = \array_values(\array_unique($tenantUids)); - $findOptions = $this->getTransactionOptions([ - 'projection' => ['_uid' => 1], - 'batchSize' => \count($tenantUids), - ]); $filters = ['_uid' => ['$in' => $tenantUids], '_tenant' => $tenant]; - $response = $this->client->find($name, $filters, $findOptions); - foreach ($response->cursor->firstBatch ?? [] as $doc) { - $existingKeys[$tenant . ':' . $doc->_uid] = true; - } - $cursorId = $response->cursor->id ?? 0; - while ($cursorId != 0) { - $more = $this->client->getMore($cursorId, $name, \count($tenantUids)); - foreach ($more->cursor->nextBatch ?? [] as $doc) { - $existingKeys[$tenant . ':' . $doc->_uid] = true; - } - $cursorId = $more->cursor->id ?? 0; + foreach ($this->findExistingUids($name, $filters, \count($tenantUids)) as $uid) { + $existingKeys[$tenant . ':' . $uid] = true; } } } else { - $uids = \array_filter(\array_map(fn ($r) => $r['_uid'] ?? null, $records), fn ($v) => $v !== null); + $uids = \array_values(\array_unique(\array_filter( + \array_map(fn ($r) => $r['_uid'] ?? null, $records), + fn ($v) => $v !== null + ))); if (!empty($uids)) { - $uidValues = \array_values(\array_unique($uids)); - $findOptions = $this->getTransactionOptions([ - 'projection' => ['_uid' => 1], - 'batchSize' => \count($uidValues), - ]); - $filters = ['_uid' => ['$in' => $uidValues]]; + $filters = ['_uid' => ['$in' => $uids]]; if ($this->sharedTables) { $filters['_tenant'] = $this->getTenantFilters($collection->getId()); } - $response = $this->client->find($name, $filters, $findOptions); - foreach ($response->cursor->firstBatch ?? [] as $doc) { - $existingKeys[$doc->_uid] = true; - } - $cursorId = $response->cursor->id ?? 0; - while ($cursorId != 0) { - $more = $this->client->getMore($cursorId, $name, \count($uidValues)); - foreach ($more->cursor->nextBatch ?? [] as $doc) { - $existingKeys[$doc->_uid] = true; - } - $cursorId = $more->cursor->id ?? 0; + foreach ($this->findExistingUids($name, $filters, \count($uids)) as $uid) { + $existingKeys[$uid] = true; } } } @@ -1556,13 +1574,8 @@ public function createDocuments(Document $collection, array $documents): array if (!empty($existingKeys)) { $filteredRecords = []; $filteredDocuments = []; - $tenantPerDoc = $this->sharedTables && $this->tenantPerDocument; foreach ($records as $i => $record) { - $uid = $record['_uid'] ?? ''; - $key = $tenantPerDoc - ? ($record['_tenant'] ?? $this->getTenant()) . ':' . $uid - : $uid; - if (!isset($existingKeys[$key])) { + if (!isset($existingKeys[$recordKey($record)])) { $filteredRecords[] = $record; $filteredDocuments[] = $documents[$i]; } diff --git a/src/Database/Adapter/Pool.php b/src/Database/Adapter/Pool.php index 10ae1de90..648372402 100644 --- a/src/Database/Adapter/Pool.php +++ b/src/Database/Adapter/Pool.php @@ -43,8 +43,10 @@ public function __construct(UtopiaPool $pool) public function delegate(string $method, array $args): mixed { if ($this->pinnedAdapter !== null) { - $invoke = fn () => $this->pinnedAdapter->{$method}(...$args); - return $this->skipDuplicates ? $this->pinnedAdapter->skipDuplicates($invoke) : $invoke(); + return $this->pinnedAdapter->skipDuplicates( + fn () => $this->pinnedAdapter->{$method}(...$args), + $this->skipDuplicates + ); } return $this->pool->use(function (Adapter $adapter) use ($method, $args) { @@ -67,8 +69,10 @@ public function delegate(string $method, array $args): mixed $adapter->setMetadata($key, $value); } - $invoke = fn () => $adapter->{$method}(...$args); - return $this->skipDuplicates ? $adapter->skipDuplicates($invoke) : $invoke(); + return $adapter->skipDuplicates( + fn () => $adapter->{$method}(...$args), + $this->skipDuplicates + ); }); } @@ -148,8 +152,10 @@ public function withTransaction(callable $callback): mixed $this->pinnedAdapter = $adapter; try { - $invoke = fn () => $adapter->withTransaction($callback); - return $this->skipDuplicates ? $adapter->skipDuplicates($invoke) : $invoke(); + return $adapter->skipDuplicates( + fn () => $adapter->withTransaction($callback), + $this->skipDuplicates + ); } finally { $this->pinnedAdapter = null; } diff --git a/src/Database/Adapter/SQL.php b/src/Database/Adapter/SQL.php index 4fb58ae3f..07994dbe9 100644 --- a/src/Database/Adapter/SQL.php +++ b/src/Database/Adapter/SQL.php @@ -2460,6 +2460,59 @@ protected function execute(mixed $stmt): bool return $stmt->execute(); } + /** + * Build a `_uid IN (...) [AND _tenant ...]` WHERE clause (and matching binds) + * for a batch of documents. Used by the skipDuplicates pre-filter and the + * race-condition reconciliation path. + * + * @param array $documents + * @param string $bindPrefix Prefix for bind keys; must differ between concurrent uses. + * @return array{where: string, tenantSelect: string, binds: array} + */ + private function buildUidTenantLookup(array $documents, string $bindPrefix): array + { + $binds = []; + $placeholders = []; + $uids = \array_values(\array_unique(\array_filter( + \array_map(fn (Document $doc) => $doc->getId(), $documents), + fn ($v) => $v !== null + ))); + foreach ($uids as $i => $uid) { + $key = ':' . $bindPrefix . 'uid_' . $i; + $placeholders[] = $key; + $binds[$key] = $uid; + } + + $tenantFilter = ''; + $tenantSelect = ''; + if ($this->sharedTables) { + if ($this->tenantPerDocument) { + $tenants = \array_values(\array_unique(\array_filter( + \array_map(fn (Document $doc) => $doc->getTenant(), $documents), + fn ($v) => $v !== null + ))); + $tenantPlaceholders = []; + foreach ($tenants as $j => $tenant) { + $tKey = ':' . $bindPrefix . 'tenant_' . $j; + $tenantPlaceholders[] = $tKey; + $binds[$tKey] = $tenant; + } + $tenantFilter = ' AND _tenant IN (' . \implode(', ', $tenantPlaceholders) . ')'; + $tenantSelect = ', _tenant'; + } else { + $tenantKey = ':' . $bindPrefix . 'tenant'; + $tenantFilter = ' AND _tenant = ' . $tenantKey; + $binds[$tenantKey] = $this->getTenant(); + } + } + + return [ + 'where' => '_uid IN (' . \implode(', ', $placeholders) . ')' . $tenantFilter, + 'tenantSelect' => $tenantSelect, + 'binds' => $binds, + ]; + } + /** * Create Documents in batches * @@ -2479,47 +2532,17 @@ public function createDocuments(Document $collection, array $documents): array // Pre-filter existing UIDs to prevent race-condition duplicates. if ($this->skipDuplicates) { - $collectionId = $collection->getId(); - $name = $this->filter($collectionId); - $uids = \array_filter(\array_map(fn (Document $doc) => $doc->getId(), $documents), fn ($v) => $v !== null); + $name = $this->filter($collection->getId()); + $lookup = $this->buildUidTenantLookup($documents, '_dup_'); - if (!empty($uids)) { + if (!empty($lookup['binds'])) { try { - $placeholders = []; - $binds = []; - foreach (\array_values(\array_unique($uids)) as $i => $uid) { - $key = ':_dup_uid_' . $i; - $placeholders[] = $key; - $binds[$key] = $uid; - } - - $tenantFilter = ''; - if ($this->sharedTables) { - if ($this->tenantPerDocument) { - $tenants = \array_values(\array_unique(\array_filter( - \array_map(fn (Document $doc) => $doc->getTenant(), $documents), - fn ($v) => $v !== null - ))); - $tenantPlaceholders = []; - foreach ($tenants as $j => $tenant) { - $tKey = ':_dup_tenant_' . $j; - $tenantPlaceholders[] = $tKey; - $binds[$tKey] = $tenant; - } - $tenantFilter = ' AND _tenant IN (' . \implode(', ', $tenantPlaceholders) . ')'; - } else { - $tenantFilter = ' AND _tenant = :_dup_tenant'; - $binds[':_dup_tenant'] = $this->getTenant(); - } - } - - $tenantSelect = $this->sharedTables && $this->tenantPerDocument ? ', _tenant' : ''; - $sql = 'SELECT _uid' . $tenantSelect . ' FROM ' . $this->getSQLTable($name) - . ' WHERE _uid IN (' . \implode(', ', $placeholders) . ')' - . $tenantFilter; + $sql = 'SELECT _uid' . $lookup['tenantSelect'] + . ' FROM ' . $this->getSQLTable($name) + . ' WHERE ' . $lookup['where']; $stmt = $this->getPDO()->prepare($sql); - foreach ($binds as $k => $v) { + foreach ($lookup['binds'] as $k => $v) { $stmt->bindValue($k, $v, $this->getPDOType($v)); } $stmt->execute(); @@ -2670,44 +2693,13 @@ public function createDocuments(Document $collection, array $documents): array $expectedTimestamps[$eKey] = $doc->getCreatedAt(); } - $verifyPlaceholders = []; - $verifyBinds = []; - $rawUids = \array_values(\array_unique(\array_map(fn (Document $doc) => $doc->getId(), $documents))); - foreach ($rawUids as $idx => $uid) { - $ph = ':_vfy_' . $idx; - $verifyPlaceholders[] = $ph; - $verifyBinds[$ph] = $uid; - } - - $tenantWhere = ''; - $tenantSelect = ''; - if ($this->sharedTables) { - if ($this->tenantPerDocument) { - $tenants = \array_values(\array_unique(\array_filter( - \array_map(fn (Document $doc) => $doc->getTenant(), $documents), - fn ($v) => $v !== null - ))); - $tenantPlaceholders = []; - foreach ($tenants as $j => $tenant) { - $tKey = ':_vfy_tenant_' . $j; - $tenantPlaceholders[] = $tKey; - $verifyBinds[$tKey] = $tenant; - } - $tenantWhere = ' AND _tenant IN (' . \implode(', ', $tenantPlaceholders) . ')'; - $tenantSelect = ', _tenant'; - } else { - $tenantWhere = ' AND _tenant = :_vfy_tenant'; - $verifyBinds[':_vfy_tenant'] = $this->getTenant(); - } - } - - $verifySql = 'SELECT _uid' . $tenantSelect . ', ' . $this->quote($this->filter('_createdAt')) + $lookup = $this->buildUidTenantLookup($documents, '_vfy_'); + $verifySql = 'SELECT _uid' . $lookup['tenantSelect'] . ', ' . $this->quote($this->filter('_createdAt')) . ' FROM ' . $this->getSQLTable($name) - . ' WHERE _uid IN (' . \implode(', ', $verifyPlaceholders) . ')' - . $tenantWhere; + . ' WHERE ' . $lookup['where']; $verifyStmt = $this->getPDO()->prepare($verifySql); - foreach ($verifyBinds as $k => $v) { + foreach ($lookup['binds'] as $k => $v) { $verifyStmt->bindValue($k, $v, $this->getPDOType($v)); } $verifyStmt->execute(); diff --git a/src/Database/Database.php b/src/Database/Database.php index 146ceacaf..626334c2b 100644 --- a/src/Database/Database.php +++ b/src/Database/Database.php @@ -844,8 +844,12 @@ public function skipRelationshipsExistCheck(callable $callback): mixed } } - public function skipDuplicates(callable $callback): mixed + public function skipDuplicates(callable $callback, bool $enable = true): mixed { + if (!$enable) { + return $callback(); + } + $previous = $this->skipDuplicates; $this->skipDuplicates = true; @@ -856,6 +860,83 @@ public function skipDuplicates(callable $callback): mixed } } + /** + * Build a tenant-aware identity key for a document. + * Returns ":" in tenant-per-document shared-table mode, otherwise just the id. + */ + private function tenantKey(Document $document): string + { + return ($this->adapter->getSharedTables() && $this->adapter->getTenantPerDocument()) + ? $document->getTenant() . ':' . $document->getId() + : $document->getId(); + } + + /** + * Batch-fetch documents by ID from a collection, keyed by tenantKey(). + * Chunks by maxQueryValues and groups queries by tenant in tenant-per-document mode. + * + * @param Document $collection + * @param array $documents Source documents (only IDs are read) + * @param bool $idsOnly When true, uses Query::select(['$id']) for a lighter fetch + * @return array + */ + private function fetchExistingByIds(Document $collection, array $documents, bool $idsOnly = false): array + { + $collectionId = $collection->getId(); + $tenantPerDocument = $this->adapter->getSharedTables() && $this->adapter->getTenantPerDocument(); + $result = []; + + $buildQueries = function (array $chunk) use ($idsOnly) { + $queries = [Query::equal('$id', $chunk)]; + if ($idsOnly) { + $queries[] = Query::select(['$id']); + } + $queries[] = Query::limit(\count($chunk)); + return $queries; + }; + + if ($tenantPerDocument) { + $idsByTenant = []; + foreach ($documents as $doc) { + $id = $doc->getId(); + if ($id !== '') { + $idsByTenant[$doc->getTenant()][] = $id; + } + } + foreach ($idsByTenant as $tenant => $tenantIds) { + $tenantIds = \array_values(\array_unique($tenantIds)); + foreach (\array_chunk($tenantIds, \max(1, $this->maxQueryValues)) as $idChunk) { + $fetched = $this->authorization->skip( + fn () => $this->withTenant( + $tenant, + fn () => $this->silent(fn () => $this->find($collectionId, $buildQueries($idChunk))) + ) + ); + foreach ($fetched as $doc) { + $result[$tenant . ':' . $doc->getId()] = $doc; + } + } + } + + return $result; + } + + $ids = \array_values(\array_unique(\array_filter( + \array_map(fn (Document $doc) => $doc->getId(), $documents), + fn ($v) => $v !== null && $v !== '' + ))); + foreach (\array_chunk($ids, \max(1, $this->maxQueryValues)) as $idChunk) { + $fetched = $this->authorization->skip( + fn () => $this->silent(fn () => $this->find($collectionId, $buildQueries($idChunk))) + ); + foreach ($fetched as $doc) { + $result[$doc->getId()] = $doc; + } + } + + return $result; + } + /** * Trigger callback for events * @@ -5667,18 +5748,13 @@ public function createDocuments( $time = DateTime::now(); $modified = 0; - $tenantPerDocument = $this->adapter->getSharedTables() && $this->adapter->getTenantPerDocument(); - // Deduplicate intra-batch documents by ID (tenant-aware). First occurrence wins. if ($this->skipDuplicates) { $seenIds = []; $deduplicated = []; foreach ($documents as $document) { - $docId = $document->getId(); - if ($docId !== '') { - $dedupeKey = $tenantPerDocument - ? $document->getTenant() . ':' . $docId - : $docId; + if ($document->getId() !== '') { + $dedupeKey = $this->tenantKey($document); if (isset($seenIds[$dedupeKey])) { continue; } @@ -5690,50 +5766,9 @@ public function createDocuments( } // Pre-fetch existing IDs to skip relationship writes for known duplicates - $preExistingIds = []; - if ($this->skipDuplicates) { - if ($tenantPerDocument) { - $idsByTenant = []; - foreach ($documents as $doc) { - $idsByTenant[$doc->getTenant()][] = $doc->getId(); - } - foreach ($idsByTenant as $tenant => $tenantIds) { - $tenantIds = \array_values(\array_unique(\array_filter($tenantIds, fn ($v) => $v !== null))); - foreach (\array_chunk($tenantIds, \max(1, $this->maxQueryValues)) as $idChunk) { - $existing = $this->authorization->skip(fn () => $this->withTenant($tenant, fn () => $this->silent(fn () => $this->find( - $collection->getId(), - [ - Query::equal('$id', $idChunk), - Query::select(['$id']), - Query::limit(\count($idChunk)), - ] - )))); - foreach ($existing as $doc) { - $preExistingIds[$tenant . ':' . $doc->getId()] = true; - } - } - } - } else { - $inputIds = \array_values(\array_unique(\array_filter( - \array_map(fn (Document $doc) => $doc->getId(), $documents), - fn ($v) => $v !== null - ))); - - foreach (\array_chunk($inputIds, \max(1, $this->maxQueryValues)) as $idChunk) { - $existing = $this->authorization->skip(fn () => $this->silent(fn () => $this->find( - $collection->getId(), - [ - Query::equal('$id', $idChunk), - Query::select(['$id']), - Query::limit(\count($idChunk)), - ] - ))); - foreach ($existing as $doc) { - $preExistingIds[$doc->getId()] = true; - } - } - } - } + $preExistingIds = $this->skipDuplicates + ? $this->fetchExistingByIds($collection, $documents, idsOnly: true) + : []; /** @var array> $deferredRelationships */ $deferredRelationships = []; @@ -5782,7 +5817,7 @@ public function createDocuments( } if ($this->resolveRelationships && !empty($relationships)) { - // Defer: store relationship data, strip attributes for INSERT. + // Defer relationship writes until after INSERT so we don't orphan records for duplicates. $relationshipData = []; foreach ($relationships as $rel) { $key = $rel['key']; @@ -5793,19 +5828,10 @@ public function createDocuments( $document->removeAttribute($key); } if (!empty($relationshipData)) { - $deferKey = $tenantPerDocument - ? $document->getTenant() . ':' . $document->getId() - : $document->getId(); - $deferredRelationships[$deferKey] = $relationshipData; + $deferredRelationships[$this->tenantKey($document)] = $relationshipData; } } elseif ($this->resolveRelationships) { - $preExistKey = $tenantPerDocument - ? $document->getTenant() . ':' . $document->getId() - : $document->getId(); - - if (!isset($preExistingIds[$preExistKey])) { - $document = $this->silent(fn () => $this->createDocumentRelationships($collection, $document)); - } + $document = $this->silent(fn () => $this->createDocumentRelationships($collection, $document)); } $document = $this->adapter->castingBefore($collection, $document); @@ -5813,30 +5839,24 @@ public function createDocuments( foreach (\array_chunk($documents, $batchSize) as $chunk) { if ($this->skipDuplicates && !empty($preExistingIds)) { - $chunk = \array_values(\array_filter($chunk, function (Document $doc) use ($preExistingIds, $tenantPerDocument) { - $key = $tenantPerDocument - ? $doc->getTenant() . ':' . $doc->getId() - : $doc->getId(); - return !isset($preExistingIds[$key]); - })); + $chunk = \array_values(\array_filter( + $chunk, + fn (Document $doc) => !isset($preExistingIds[$this->tenantKey($doc)]) + )); if (empty($chunk)) { continue; } } - $batch = $this->withTransaction(function () use ($collection, $chunk) { - $createFn = fn () => $this->adapter->createDocuments($collection, $chunk); - return $this->skipDuplicates - ? $this->adapter->skipDuplicates($createFn) - : $createFn(); - }); + $batch = $this->withTransaction(fn () => $this->adapter->skipDuplicates( + fn () => $this->adapter->createDocuments($collection, $chunk), + $this->skipDuplicates + )); // Create deferred relationships only for docs that were actually inserted if ($this->skipDuplicates && $this->resolveRelationships && \count($deferredRelationships) > 0) { foreach ($batch as $insertedDoc) { - $deferKey = $tenantPerDocument - ? $insertedDoc->getTenant() . ':' . $insertedDoc->getId() - : $insertedDoc->getId(); + $deferKey = $this->tenantKey($insertedDoc); if (\array_key_exists($deferKey, $deferredRelationships)) { $relDoc = clone $insertedDoc; foreach ($deferredRelationships[$deferKey] as $key => $value) { @@ -7262,49 +7282,10 @@ public function upsertDocumentsWithIncrease( $seenIds = []; // Batch-fetch existing documents in one query instead of N individual getDocument() calls - $ids = \array_filter(\array_map(fn ($doc) => $doc->getId(), $documents), fn ($v) => $v !== null); - $existingDocs = []; - $upsertTenantPerDocument = $this->getSharedTables() && $this->getTenantPerDocument(); - - if (!empty($ids)) { - $uniqueIds = \array_values(\array_unique($ids)); - - if ($upsertTenantPerDocument) { - $idsByTenant = []; - foreach ($documents as $doc) { - $tenant = $doc->getTenant(); - $idsByTenant[$tenant][] = $doc->getId(); - } - foreach ($idsByTenant as $tenant => $tenantIds) { - $tenantIds = \array_values(\array_unique(\array_filter($tenantIds, fn ($v) => $v !== null))); - foreach (\array_chunk($tenantIds, \max(1, $this->maxQueryValues)) as $idChunk) { - $fetched = $this->authorization->skip(fn () => $this->withTenant($tenant, fn () => $this->silent(fn () => $this->find( - $collection->getId(), - [Query::equal('$id', $idChunk), Query::limit(\count($idChunk))], - )))); - foreach ($fetched as $doc) { - $existingDocs[$tenant . ':' . $doc->getId()] = $doc; - } - } - } - } else { - foreach (\array_chunk($uniqueIds, \max(1, $this->maxQueryValues)) as $idChunk) { - $fetched = $this->authorization->skip(fn () => $this->silent(fn () => $this->find( - $collection->getId(), - [Query::equal('$id', $idChunk), Query::limit(\count($idChunk))], - ))); - foreach ($fetched as $doc) { - $existingDocs[$doc->getId()] = $doc; - } - } - } - } + $existingDocs = $this->fetchExistingByIds($collection, $documents); foreach ($documents as $key => $document) { - $lookupKey = $upsertTenantPerDocument - ? $document->getTenant() . ':' . $document->getId() - : $document->getId(); - $old = $existingDocs[$lookupKey] ?? new Document(); + $old = $existingDocs[$this->tenantKey($document)] ?? new Document(); // Extract operators early to avoid comparison issues $documentArray = $document->getArrayCopy(); @@ -7471,9 +7452,7 @@ public function upsertDocumentsWithIncrease( $document = $this->silent(fn () => $this->createDocumentRelationships($collection, $document)); } - $seenIds[] = $upsertTenantPerDocument - ? $document->getTenant() . ':' . $document->getId() - : $document->getId(); + $seenIds[] = $this->tenantKey($document); $old = $this->adapter->castingBefore($collection, $old); $document = $this->adapter->castingBefore($collection, $document); diff --git a/src/Database/Mirror.php b/src/Database/Mirror.php index 5f3ae640c..5d5777a5d 100644 --- a/src/Database/Mirror.php +++ b/src/Database/Mirror.php @@ -601,18 +601,17 @@ public function createDocuments( ?callable $onNext = null, ?callable $onError = null, ): int { - $createFn = fn () => $this->source->createDocuments( - $collection, - $documents, - $batchSize, - $onNext, - $onError, + $modified = $this->source->skipDuplicates( + fn () => $this->source->createDocuments( + $collection, + $documents, + $batchSize, + $onNext, + $onError, + ), + $this->skipDuplicates ); - $modified = $this->skipDuplicates - ? $this->source->skipDuplicates($createFn) - : $createFn(); - if ( \in_array($collection, self::SOURCE_ONLY_COLLECTIONS) || $this->destination === null @@ -643,19 +642,17 @@ public function createDocuments( $clones[] = $clone; } - $destFn = fn () => $this->destination->withPreserveDates( - fn () => - $this->destination->createDocuments( - $collection, - $clones, - $batchSize, - ) + $this->destination->skipDuplicates( + fn () => $this->destination->withPreserveDates( + fn () => $this->destination->createDocuments( + $collection, + $clones, + $batchSize, + ) + ), + $this->skipDuplicates ); - $this->skipDuplicates - ? $this->destination->skipDuplicates($destFn) - : $destFn(); - foreach ($clones as $clone) { foreach ($this->writeFilters as $filter) { $filter->afterCreateDocument( From c88c6ac52715fe87ed22eec69a27b4d640341b1a Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Sun, 12 Apr 2026 14:52:46 +0100 Subject: [PATCH 08/25] fix: guard buildUidTenantLookup against empty UID set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a batch has only null-ID documents but non-null tenants, the previous code emitted WHERE _uid IN () AND _tenant IN (...) — invalid SQL that the !empty(binds) call-site guard failed to catch because the tenant binds kept the binds array non-empty. Return 'where' => '1=0' with empty binds when there are no UIDs, so the query matches nothing and leaves no dangling placeholders. --- src/Database/Adapter/SQL.php | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/Database/Adapter/SQL.php b/src/Database/Adapter/SQL.php index 07994dbe9..f2e84fa30 100644 --- a/src/Database/Adapter/SQL.php +++ b/src/Database/Adapter/SQL.php @@ -2506,6 +2506,15 @@ private function buildUidTenantLookup(array $documents, string $bindPrefix): arr } } + // No UIDs to match → return a clause that matches nothing without leaving dangling binds. + if (empty($placeholders)) { + return [ + 'where' => '1=0', + 'tenantSelect' => $tenantSelect, + 'binds' => [], + ]; + } + return [ 'where' => '_uid IN (' . \implode(', ', $placeholders) . ')' . $tenantFilter, 'tenantSelect' => $tenantSelect, From 16849fe1e90b4ad85f0527afdd935278576aff1f Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Mon, 13 Apr 2026 03:30:39 +0100 Subject: [PATCH 09/25] Add e2e tests for skipDuplicates edge cases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - testCreateDocumentsSkipDuplicatesEmptyBatch: no-op with empty array - testCreateDocumentsSkipDuplicatesNestedScope: scope guard save/restore through nested skipDuplicates() calls - testCreateDocumentsSkipDuplicatesLargeBatch: 300-doc batch with 50 pre-existing, exercising the chunk loop - testCreateDocumentsSkipDuplicatesSecondCallSkipsAll: second identical batch should preserve First values, not overwrite with Second - testCreateDocumentsSkipDuplicatesRelationships: skipDuplicates with a one-to-many relationship — verifies deferred relationship logic does not create orphan child rows for ignored parents --- tests/e2e/Adapter/Scopes/DocumentTests.php | 277 +++++++++++++++++++++ 1 file changed, 277 insertions(+) diff --git a/tests/e2e/Adapter/Scopes/DocumentTests.php b/tests/e2e/Adapter/Scopes/DocumentTests.php index 5f82cc025..d3b5d0f0a 100644 --- a/tests/e2e/Adapter/Scopes/DocumentTests.php +++ b/tests/e2e/Adapter/Scopes/DocumentTests.php @@ -7926,4 +7926,281 @@ public function testCreateDocumentsIgnoreAllDuplicates(): void $all = $database->find(__FUNCTION__); $this->assertCount(1, $all); } + + public function testCreateDocumentsSkipDuplicatesEmptyBatch(): void + { + $database = $this->getDatabase(); + + $collection = 'skipDupEmpty'; + $database->createCollection($collection); + $database->createAttribute($collection, 'name', Database::VAR_STRING, 128, true); + + $count = $database->skipDuplicates(fn () => $database->createDocuments($collection, [])); + + $this->assertSame(0, $count); + $this->assertCount(0, $database->find($collection)); + } + + public function testCreateDocumentsSkipDuplicatesNestedScope(): void + { + $database = $this->getDatabase(); + + $collection = 'skipDupNested'; + $database->createCollection($collection); + $database->createAttribute($collection, 'name', Database::VAR_STRING, 128, true); + + $makeDoc = fn (string $id, string $name) => new Document([ + '$id' => $id, + 'name' => $name, + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]); + + // Seed an existing doc + $database->createDocuments($collection, [$makeDoc('seed', 'Seed')]); + + // Nested scope — inner scope runs inside outer scope. + // After inner exits, outer state should still be "skip enabled". + // After outer exits, state should restore to "skip disabled". + $countOuter = $database->skipDuplicates(function () use ($database, $collection, $makeDoc) { + // Inner scope: add dup + new + $countInner = $database->skipDuplicates(function () use ($database, $collection, $makeDoc) { + return $database->createDocuments($collection, [ + $makeDoc('seed', 'Dup'), + $makeDoc('innerNew', 'InnerNew'), + ]); + }); + $this->assertSame(1, $countInner, 'Inner scope should insert only innerNew'); + + // Still inside outer scope — skip flag should still be on + return $database->createDocuments($collection, [ + $makeDoc('seed', 'Dup2'), + $makeDoc('outerNew', 'OuterNew'), + ]); + }); + $this->assertSame(1, $countOuter, 'Outer scope should insert only outerNew'); + + // After both scopes exit, skip flag is off again — a plain createDocuments + // call with a duplicate should throw. + $thrown = null; + try { + $database->createDocuments($collection, [$makeDoc('seed', 'ShouldThrow')]); + } catch (DuplicateException $e) { + $thrown = $e; + } + $this->assertNotNull($thrown, 'Plain createDocuments after nested scopes should throw on duplicate'); + + // Final state: seed + innerNew + outerNew + $all = $database->find($collection); + $ids = \array_map(fn (Document $d) => $d->getId(), $all); + \sort($ids); + $this->assertSame(['innerNew', 'outerNew', 'seed'], $ids); + } + + public function testCreateDocumentsSkipDuplicatesLargeBatch(): void + { + $database = $this->getDatabase(); + + $collection = 'skipDupLarge'; + $database->createCollection($collection); + $database->createAttribute($collection, 'idx', Database::VAR_INTEGER, 0, true); + + // Seed 50 docs + $seed = []; + for ($i = 0; $i < 50; $i++) { + $seed[] = new Document([ + '$id' => 'doc_' . $i, + 'idx' => $i, + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]); + } + $database->createDocuments($collection, $seed); + + // Now call skipDuplicates with 300 docs: 50 existing (0-49) + 250 new (50-299). + // 300 > default INSERT_BATCH_SIZE, so this exercises the chunk loop. + $batch = []; + for ($i = 0; $i < 300; $i++) { + $batch[] = new Document([ + '$id' => 'doc_' . $i, + 'idx' => $i + 1000, // different value so we can detect if existing got overwritten + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]); + } + + $emittedIds = []; + $count = $database->skipDuplicates(function () use ($database, $collection, $batch, &$emittedIds) { + return $database->createDocuments($collection, $batch, onNext: function (Document $doc) use (&$emittedIds) { + $emittedIds[] = $doc->getId(); + }); + }); + + $this->assertSame(250, $count, '250 new docs should have been inserted'); + $this->assertCount(250, $emittedIds); + + // The 50 seed docs must still have their original idx values (not overwritten). + $seedDoc = $database->getDocument($collection, 'doc_25'); + $this->assertSame(25, $seedDoc->getAttribute('idx'), 'Existing docs must not be overwritten'); + + // A newly-inserted doc should have the batch value. + $newDoc = $database->getDocument($collection, 'doc_100'); + $this->assertSame(1100, $newDoc->getAttribute('idx')); + + // Total count: 300 + $total = $database->count($collection); + $this->assertSame(300, $total); + } + + public function testCreateDocumentsSkipDuplicatesSecondCallSkipsAll(): void + { + $database = $this->getDatabase(); + + $collection = 'skipDupSecond'; + $database->createCollection($collection); + $database->createAttribute($collection, 'name', Database::VAR_STRING, 128, true); + + $makeBatch = fn (string $name) => \array_map( + fn (string $id) => new Document([ + '$id' => $id, + 'name' => $name, + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]), + ['a', 'b', 'c'] + ); + + // First call — all new + $firstCount = $database->skipDuplicates( + fn () => $database->createDocuments($collection, $makeBatch('First')) + ); + $this->assertSame(3, $firstCount); + + // Second call — identical ids, all pre-existing, should be silently skipped + $emittedIds = []; + $secondCount = $database->skipDuplicates(function () use ($database, $collection, $makeBatch, &$emittedIds) { + return $database->createDocuments($collection, $makeBatch('Second'), onNext: function (Document $doc) use (&$emittedIds) { + $emittedIds[] = $doc->getId(); + }); + }); + $this->assertSame(0, $secondCount); + $this->assertSame([], $emittedIds); + + // All three should retain the First values + foreach (['a', 'b', 'c'] as $id) { + $doc = $database->getDocument($collection, $id); + $this->assertSame('First', $doc->getAttribute('name'), "Doc {$id} should not have been overwritten"); + } + } + + public function testCreateDocumentsSkipDuplicatesRelationships(): void + { + $database = $this->getDatabase(); + + if (!$database->getAdapter()->getSupportForRelationships()) { + $this->expectNotToPerformAssertions(); + return; + } + + $parent = 'skipDupParent'; + $child = 'skipDupChild'; + $permissions = [ + Permission::read(Role::any()), + Permission::create(Role::any()), + Permission::update(Role::any()), + Permission::delete(Role::any()), + ]; + + $database->createCollection($parent); + $database->createCollection($child); + $database->createAttribute($parent, 'name', Database::VAR_STRING, 128, true); + $database->createAttribute($child, 'name', Database::VAR_STRING, 128, true); + $database->createRelationship( + collection: $parent, + relatedCollection: $child, + type: Database::RELATION_ONE_TO_MANY, + id: 'children', + ); + + // Seed: existing parent with one existing child. + $database->createDocument($parent, new Document([ + '$id' => 'existingParent', + 'name' => 'ExistingParent', + '$permissions' => $permissions, + 'children' => [ + new Document([ + '$id' => 'existingChild', + 'name' => 'ExistingChild', + '$permissions' => $permissions, + ]), + ], + ])); + + // Batch: existing parent (with children that should be IGNORED because the + // parent already exists) + new parent (with children that SHOULD be created). + $batch = [ + new Document([ + '$id' => 'existingParent', + 'name' => 'ShouldNotOverwrite', + '$permissions' => $permissions, + 'children' => [ + new Document([ + '$id' => 'orphanChild', + 'name' => 'OrphanChild', + '$permissions' => $permissions, + ]), + ], + ]), + new Document([ + '$id' => 'newParent', + 'name' => 'NewParent', + '$permissions' => $permissions, + 'children' => [ + new Document([ + '$id' => 'newChild', + 'name' => 'NewChild', + '$permissions' => $permissions, + ]), + ], + ]), + ]; + + $count = $database->skipDuplicates(fn () => $database->createDocuments($parent, $batch)); + $this->assertSame(1, $count, 'Only newParent should be inserted'); + + // existingParent untouched + $existing = $database->getDocument($parent, 'existingParent'); + $this->assertFalse($existing->isEmpty()); + $this->assertSame('ExistingParent', $existing->getAttribute('name'), 'Existing parent name must not be overwritten'); + $existingChildren = $existing->getAttribute('children', []); + $this->assertCount(1, $existingChildren); + $this->assertSame('existingChild', $existingChildren[0]->getId()); + + // newParent and its child exist + $new = $database->getDocument($parent, 'newParent'); + $this->assertFalse($new->isEmpty()); + $this->assertSame('NewParent', $new->getAttribute('name')); + $newChildren = $new->getAttribute('children', []); + $this->assertCount(1, $newChildren); + $this->assertSame('newChild', $newChildren[0]->getId()); + + // Most important assertion: the child record from the ignored parent entry + // must NOT have been written — no orphan rows. + $orphan = $database->getDocument($child, 'orphanChild'); + $this->assertTrue($orphan->isEmpty(), 'orphanChild must not exist (deferred relationship was correctly skipped for the ignored parent)'); + + // Child collection should contain exactly 2 docs: existingChild + newChild + $allChildren = $database->find($child); + $childIds = \array_map(fn (Document $d) => $d->getId(), $allChildren); + \sort($childIds); + $this->assertSame(['existingChild', 'newChild'], $childIds); + } } From c6d566ee9feade4488086db2ff8eff0d8a187a88 Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Mon, 13 Apr 2026 03:30:52 +0100 Subject: [PATCH 10/25] SQL adapter: remove redundant skipDuplicates pre-filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit INSERT IGNORE / ON CONFLICT DO NOTHING / INSERT OR IGNORE handle duplicates at the DB level, and the existing reconciliation block identifies actually-inserted rows via the _createdAt timestamp. The pre-filter was a wire-data optimization that ran a SELECT before the INSERT, then trimmed the batch — but the same SELECT (with timestamp comparison) already happens in the reconciliation path on demand. Net: ~45 lines removed, single code path for skipDuplicates instead of pre-filter + fallback. Race-condition handling is unchanged (reconciliation path covers it). --- src/Database/Adapter/SQL.php | 49 ++---------------------------------- 1 file changed, 2 insertions(+), 47 deletions(-) diff --git a/src/Database/Adapter/SQL.php b/src/Database/Adapter/SQL.php index f2e84fa30..e320a5b70 100644 --- a/src/Database/Adapter/SQL.php +++ b/src/Database/Adapter/SQL.php @@ -2462,8 +2462,8 @@ protected function execute(mixed $stmt): bool /** * Build a `_uid IN (...) [AND _tenant ...]` WHERE clause (and matching binds) - * for a batch of documents. Used by the skipDuplicates pre-filter and the - * race-condition reconciliation path. + * for a batch of documents. Used by skipDuplicates reconciliation to identify + * actually-inserted rows after an INSERT IGNORE / ON CONFLICT DO NOTHING. * * @param array $documents * @param string $bindPrefix Prefix for bind keys; must differ between concurrent uses. @@ -2539,51 +2539,6 @@ public function createDocuments(Document $collection, array $documents): array return $documents; } - // Pre-filter existing UIDs to prevent race-condition duplicates. - if ($this->skipDuplicates) { - $name = $this->filter($collection->getId()); - $lookup = $this->buildUidTenantLookup($documents, '_dup_'); - - if (!empty($lookup['binds'])) { - try { - $sql = 'SELECT _uid' . $lookup['tenantSelect'] - . ' FROM ' . $this->getSQLTable($name) - . ' WHERE ' . $lookup['where']; - - $stmt = $this->getPDO()->prepare($sql); - foreach ($lookup['binds'] as $k => $v) { - $stmt->bindValue($k, $v, $this->getPDOType($v)); - } - $stmt->execute(); - $rows = $stmt->fetchAll(); - $stmt->closeCursor(); - - if ($this->sharedTables && $this->tenantPerDocument) { - $existingKeys = []; - foreach ($rows as $row) { - $existingKeys[$row['_tenant'] . ':' . $row['_uid']] = true; - } - $documents = \array_values(\array_filter( - $documents, - fn (Document $doc) => !isset($existingKeys[$doc->getTenant() . ':' . $doc->getId()]) - )); - } else { - $existingUids = \array_flip(\array_column($rows, '_uid')); - $documents = \array_values(\array_filter( - $documents, - fn (Document $doc) => !isset($existingUids[$doc->getId()]) - )); - } - } catch (PDOException $e) { - throw $this->processException($e); - } - } - - if (empty($documents)) { - return []; - } - } - $spatialAttributes = $this->getSpatialAttributes($collection); $collection = $collection->getId(); try { From fd37b69b5db955e555c8e3e7598fda97358ba694 Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Mon, 13 Apr 2026 03:31:11 +0100 Subject: [PATCH 11/25] Mongo adapter: use upsertWithCounts for race-safe accurate counts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the Database orchestrator carried a pre-fetch + chunk-filter pass to compensate for Mongo's plain upsert() returning matched + upserted combined ('n') — without that compensation, $modified would over-count the number of pre-existing docs in the batch. This pushes the responsibility into the adapter, where it belongs: - composer bump to utopia-php/mongo dev-upsert-return-upserted-count (PR #37, adds Client::upsertWithCounts() returning matched/modified/ upserted[] separately) - Mongo adapter calls upsertWithCounts() and maps the response's upserted[].index entries back to the originating Document objects, returning ONLY the actually-inserted docs (race-safe) - Database::createDocuments drops $preExistingIds pre-fetch and the chunk filter; the adapter is now responsible for accurate counts Net effect on Database.php: ~17 lines removed. Net effect on Mongo.php: ~80 lines net deletion (replaced the inline pre-filter with a single upsertWithCounts call + index mapping). --- composer.json | 11 ++- composer.lock | 56 ++++++++----- src/Database/Adapter/Mongo.php | 138 ++++++++++++--------------------- src/Database/Database.php | 38 +++------ 4 files changed, 106 insertions(+), 137 deletions(-) diff --git a/composer.json b/composer.json index 824b193a3..ff0d71e91 100755 --- a/composer.json +++ b/composer.json @@ -4,7 +4,8 @@ "type": "library", "keywords": ["php","framework", "upf", "utopia", "database"], "license": "MIT", - "minimum-stability": "stable", + "minimum-stability": "dev", + "prefer-stable": true, "autoload": { "psr-4": {"Utopia\\Database\\": "src/Database"} }, @@ -41,7 +42,7 @@ "utopia-php/console": "0.1.*", "utopia-php/cache": "1.*", "utopia-php/pools": "1.*", - "utopia-php/mongo": "1.*" + "utopia-php/mongo": "dev-upsert-return-upserted-count" }, "require-dev": { "fakerphp/faker": "1.23.*", @@ -59,6 +60,12 @@ "mongodb/mongodb": "Needed to support MongoDB Database Adapter" }, + "repositories": [ + { + "type": "vcs", + "url": "https://github.com/utopia-php/mongo.git" + } + ], "config": { "allow-plugins": { "php-http/discovery": false, diff --git a/composer.lock b/composer.lock index ad3e77da1..b1bf6d388 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "e7e2b8f8ff6424bb98827b11e25323e8", + "content-hash": "4f64a9be4913ee68ff7df2a8db313eac", "packages": [ { "name": "brick/math", @@ -1807,16 +1807,16 @@ }, { "name": "symfony/polyfill-php85", - "version": "v1.33.0", + "version": "v1.34.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-php85.git", - "reference": "d4e5fcd4ab3d998ab16c0db48e6cbb9a01993f91" + "reference": "2c408a6bb0313e6001a83628dc5506100474254e" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/d4e5fcd4ab3d998ab16c0db48e6cbb9a01993f91", - "reference": "d4e5fcd4ab3d998ab16c0db48e6cbb9a01993f91", + "url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/2c408a6bb0313e6001a83628dc5506100474254e", + "reference": "2c408a6bb0313e6001a83628dc5506100474254e", "shasum": "" }, "require": { @@ -1863,7 +1863,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php85/tree/v1.33.0" + "source": "https://github.com/symfony/polyfill-php85/tree/v1.34.0" }, "funding": [ { @@ -1883,7 +1883,7 @@ "type": "tidelift" } ], - "time": "2025-06-23T16:12:55+00:00" + "time": "2026-04-10T16:50:15+00:00" }, { "name": "symfony/service-contracts", @@ -2126,16 +2126,16 @@ }, { "name": "utopia-php/mongo", - "version": "1.0.0", + "version": "dev-upsert-return-upserted-count", "source": { "type": "git", "url": "https://github.com/utopia-php/mongo.git", - "reference": "45bedf36c2c946ec7a0a3e59b9f12f772de0b01d" + "reference": "25d0a8c02b1a39d3f4c5b2a9cbb76171319c7ba5" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/utopia-php/mongo/zipball/45bedf36c2c946ec7a0a3e59b9f12f772de0b01d", - "reference": "45bedf36c2c946ec7a0a3e59b9f12f772de0b01d", + "url": "https://api.github.com/repos/utopia-php/mongo/zipball/25d0a8c02b1a39d3f4c5b2a9cbb76171319c7ba5", + "reference": "25d0a8c02b1a39d3f4c5b2a9cbb76171319c7ba5", "shasum": "" }, "require": { @@ -2157,7 +2157,25 @@ "Utopia\\Mongo\\": "src" } }, - "notification-url": "https://packagist.org/downloads/", + "autoload-dev": { + "psr-4": { + "Utopia\\Tests\\": "tests" + } + }, + "scripts": { + "test": [ + "phpunit" + ], + "check": [ + "vendor/bin/phpstan analyse --memory-limit=1G src" + ], + "format": [ + "vendor/bin/pint" + ], + "lint": [ + "vendor/bin/pint --test" + ] + }, "license": [ "MIT" ], @@ -2180,10 +2198,10 @@ "utopia" ], "support": { - "issues": "https://github.com/utopia-php/mongo/issues", - "source": "https://github.com/utopia-php/mongo/tree/1.0.0" + "source": "https://github.com/utopia-php/mongo/tree/upsert-return-upserted-count", + "issues": "https://github.com/utopia-php/mongo/issues" }, - "time": "2026-02-12T05:54:06+00:00" + "time": "2026-04-13T01:32:46+00:00" }, { "name": "utopia-php/pools", @@ -4582,9 +4600,11 @@ } ], "aliases": [], - "minimum-stability": "stable", - "stability-flags": {}, - "prefer-stable": false, + "minimum-stability": "dev", + "stability-flags": { + "utopia-php/mongo": 20 + }, + "prefer-stable": true, "prefer-lowest": false, "platform": { "php": ">=8.4", diff --git a/src/Database/Adapter/Mongo.php b/src/Database/Adapter/Mongo.php index 3f6bbc354..facf1d562 100644 --- a/src/Database/Adapter/Mongo.php +++ b/src/Database/Adapter/Mongo.php @@ -121,6 +121,14 @@ public function withTransaction(callable $callback): mixed return $callback(); } + // skipDuplicates uses upsert + $setOnInsert, whose filter query would hit + // WriteConflict (E112) under transaction snapshot isolation. The upsert is + // atomic on its own and relationship writes are deferred by the Database + // layer, so running outside a transaction is both safe and necessary. + if ($this->skipDuplicates) { + return $callback(); + } + try { $this->startTransaction(); $result = $callback(); @@ -1449,42 +1457,6 @@ public function castingBefore(Document $collection, Document $document): Documen return $document; } - /** - * Find existing `_uid` values matching the given filters, fully walking the cursor. - * - * @param string $name - * @param array $filters - * @param int $batchSize Hint: expected max result count — lets MongoDB return everything in one batch - * @return array - * - * @throws MongoException - */ - private function findExistingUids(string $name, array $filters, int $batchSize): array - { - $options = $this->getTransactionOptions([ - 'projection' => ['_uid' => 1], - 'batchSize' => $batchSize, - ]); - - $response = $this->client->find($name, $filters, $options); - $uids = []; - - foreach ($response->cursor->firstBatch ?? [] as $doc) { - $uids[] = $doc->_uid; - } - - $cursorId = $response->cursor->id ?? 0; - while ($cursorId != 0) { - $more = $this->client->getMore($cursorId, $name, $batchSize); - foreach ($more->cursor->nextBatch ?? [] as $doc) { - $uids[] = $doc->_uid; - } - $cursorId = $more->cursor->id ?? 0; - } - - return $uids; - } - /** * Create Documents in batches * @@ -1523,70 +1495,58 @@ public function createDocuments(Document $collection, array $documents): array $records[] = $record; } - // Pre-filter duplicates within the session — MongoDB has no INSERT IGNORE, - // so sending a duplicate would abort the entire transaction. - if ($this->skipDuplicates && !empty($records)) { - $tenantPerDoc = $this->sharedTables && $this->tenantPerDocument; - $recordKey = fn (array $record): string => $tenantPerDoc - ? ($record['_tenant'] ?? $this->getTenant()) . ':' . ($record['_uid'] ?? '') - : ($record['_uid'] ?? ''); + // skipDuplicates: use upsert + $setOnInsert so an already-present row (whether + // from a prior batch or a concurrent writer) becomes a silent no-op. MongoDB has + // no INSERT IGNORE, and plain insertMany aborts the transaction on any duplicate. + // Adapter::withTransaction opts out of a real transaction in this mode to avoid + // snapshot-isolation-induced WriteConflict errors from the upsert's filter query. + if ($this->skipDuplicates) { + if (empty($records)) { + return []; + } - $existingKeys = []; + $operations = []; + $opIndexToDocIndex = []; + foreach ($records as $i => $record) { + $filter = ['_uid' => $record['_uid'] ?? '']; + if ($this->sharedTables) { + $filter['_tenant'] = $record['_tenant'] ?? $this->getTenant(); + } - try { - if ($tenantPerDoc) { - $idsByTenant = []; - foreach ($records as $record) { - $uid = $record['_uid'] ?? ''; - if ($uid === '') { - continue; - } - $tenant = $record['_tenant'] ?? $this->getTenant(); - $idsByTenant[$tenant][] = $uid; - } + // Filter fields cannot reappear in $setOnInsert — MongoDB rejects + // the update with a path-conflict error otherwise. + $setOnInsert = $record; + unset($setOnInsert['_uid'], $setOnInsert['_tenant']); - foreach ($idsByTenant as $tenant => $tenantUids) { - $tenantUids = \array_values(\array_unique($tenantUids)); - $filters = ['_uid' => ['$in' => $tenantUids], '_tenant' => $tenant]; - foreach ($this->findExistingUids($name, $filters, \count($tenantUids)) as $uid) { - $existingKeys[$tenant . ':' . $uid] = true; - } - } - } else { - $uids = \array_values(\array_unique(\array_filter( - \array_map(fn ($r) => $r['_uid'] ?? null, $records), - fn ($v) => $v !== null - ))); - if (!empty($uids)) { - $filters = ['_uid' => ['$in' => $uids]]; - if ($this->sharedTables) { - $filters['_tenant'] = $this->getTenantFilters($collection->getId()); - } - foreach ($this->findExistingUids($name, $filters, \count($uids)) as $uid) { - $existingKeys[$uid] = true; - } - } + if (empty($setOnInsert)) { + continue; } + + $opIndexToDocIndex[\count($operations)] = $i; + $operations[] = [ + 'filter' => $filter, + 'update' => ['$setOnInsert' => $setOnInsert], + ]; + } + + try { + $result = $this->client->upsertWithCounts($name, $operations, $options); } catch (MongoException $e) { throw $this->processException($e); } - if (!empty($existingKeys)) { - $filteredRecords = []; - $filteredDocuments = []; - foreach ($records as $i => $record) { - if (!isset($existingKeys[$recordKey($record)])) { - $filteredRecords[] = $record; - $filteredDocuments[] = $documents[$i]; - } + // Map MongoDB's upserted entries back to the originating documents + // so the caller sees an exact "actually inserted" list — matched + // (pre-existing) docs are silently dropped. + $inserted = []; + foreach ($result['upserted'] as $entry) { + $docIndex = $opIndexToDocIndex[$entry['index']] ?? null; + if ($docIndex !== null) { + $inserted[] = $documents[$docIndex]; } - $records = $filteredRecords; - $documents = $filteredDocuments; } - if (empty($records)) { - return []; - } + return $inserted; } try { diff --git a/src/Database/Database.php b/src/Database/Database.php index 626334c2b..4c2820f26 100644 --- a/src/Database/Database.php +++ b/src/Database/Database.php @@ -877,23 +877,18 @@ private function tenantKey(Document $document): string * * @param Document $collection * @param array $documents Source documents (only IDs are read) - * @param bool $idsOnly When true, uses Query::select(['$id']) for a lighter fetch * @return array */ - private function fetchExistingByIds(Document $collection, array $documents, bool $idsOnly = false): array + private function fetchExistingByIds(Document $collection, array $documents): array { $collectionId = $collection->getId(); $tenantPerDocument = $this->adapter->getSharedTables() && $this->adapter->getTenantPerDocument(); $result = []; - $buildQueries = function (array $chunk) use ($idsOnly) { - $queries = [Query::equal('$id', $chunk)]; - if ($idsOnly) { - $queries[] = Query::select(['$id']); - } - $queries[] = Query::limit(\count($chunk)); - return $queries; - }; + $buildQueries = fn (array $chunk) => [ + Query::equal('$id', $chunk), + Query::limit(\count($chunk)), + ]; if ($tenantPerDocument) { $idsByTenant = []; @@ -5765,11 +5760,6 @@ public function createDocuments( $documents = $deduplicated; } - // Pre-fetch existing IDs to skip relationship writes for known duplicates - $preExistingIds = $this->skipDuplicates - ? $this->fetchExistingByIds($collection, $documents, idsOnly: true) - : []; - /** @var array> $deferredRelationships */ $deferredRelationships = []; $relationships = []; @@ -5838,20 +5828,12 @@ public function createDocuments( } foreach (\array_chunk($documents, $batchSize) as $chunk) { - if ($this->skipDuplicates && !empty($preExistingIds)) { - $chunk = \array_values(\array_filter( - $chunk, - fn (Document $doc) => !isset($preExistingIds[$this->tenantKey($doc)]) - )); - if (empty($chunk)) { - continue; - } - } - - $batch = $this->withTransaction(fn () => $this->adapter->skipDuplicates( - fn () => $this->adapter->createDocuments($collection, $chunk), + // skipDuplicates wraps withTransaction so the adapter's flag is set before + // Mongo::withTransaction decides whether to start a real transaction. + $batch = $this->adapter->skipDuplicates( + fn () => $this->withTransaction(fn () => $this->adapter->createDocuments($collection, $chunk)), $this->skipDuplicates - )); + ); // Create deferred relationships only for docs that were actually inserted if ($this->skipDuplicates && $this->resolveRelationships && \count($deferredRelationships) > 0) { From 3b783afe103d77045143231425f18b6766076926 Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Mon, 13 Apr 2026 04:27:03 +0100 Subject: [PATCH 12/25] Revert "SQL adapter: remove redundant skipDuplicates pre-filter" This reverts commit c6d566ee9feade4488086db2ff8eff0d8a187a88. --- src/Database/Adapter/SQL.php | 49 ++++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/src/Database/Adapter/SQL.php b/src/Database/Adapter/SQL.php index e320a5b70..f2e84fa30 100644 --- a/src/Database/Adapter/SQL.php +++ b/src/Database/Adapter/SQL.php @@ -2462,8 +2462,8 @@ protected function execute(mixed $stmt): bool /** * Build a `_uid IN (...) [AND _tenant ...]` WHERE clause (and matching binds) - * for a batch of documents. Used by skipDuplicates reconciliation to identify - * actually-inserted rows after an INSERT IGNORE / ON CONFLICT DO NOTHING. + * for a batch of documents. Used by the skipDuplicates pre-filter and the + * race-condition reconciliation path. * * @param array $documents * @param string $bindPrefix Prefix for bind keys; must differ between concurrent uses. @@ -2539,6 +2539,51 @@ public function createDocuments(Document $collection, array $documents): array return $documents; } + // Pre-filter existing UIDs to prevent race-condition duplicates. + if ($this->skipDuplicates) { + $name = $this->filter($collection->getId()); + $lookup = $this->buildUidTenantLookup($documents, '_dup_'); + + if (!empty($lookup['binds'])) { + try { + $sql = 'SELECT _uid' . $lookup['tenantSelect'] + . ' FROM ' . $this->getSQLTable($name) + . ' WHERE ' . $lookup['where']; + + $stmt = $this->getPDO()->prepare($sql); + foreach ($lookup['binds'] as $k => $v) { + $stmt->bindValue($k, $v, $this->getPDOType($v)); + } + $stmt->execute(); + $rows = $stmt->fetchAll(); + $stmt->closeCursor(); + + if ($this->sharedTables && $this->tenantPerDocument) { + $existingKeys = []; + foreach ($rows as $row) { + $existingKeys[$row['_tenant'] . ':' . $row['_uid']] = true; + } + $documents = \array_values(\array_filter( + $documents, + fn (Document $doc) => !isset($existingKeys[$doc->getTenant() . ':' . $doc->getId()]) + )); + } else { + $existingUids = \array_flip(\array_column($rows, '_uid')); + $documents = \array_values(\array_filter( + $documents, + fn (Document $doc) => !isset($existingUids[$doc->getId()]) + )); + } + } catch (PDOException $e) { + throw $this->processException($e); + } + } + + if (empty($documents)) { + return []; + } + } + $spatialAttributes = $this->getSpatialAttributes($collection); $collection = $collection->getId(); try { From 3a483f2c66e3abdfd3682554e7212cb78093bfd1 Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Mon, 13 Apr 2026 04:51:36 +0100 Subject: [PATCH 13/25] Mirror: forward only docs the source actually inserted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously Mirror::createDocuments built its destination payload from the original input ($documents), not from the docs the source actually persisted. In skipDuplicates mode that lets the destination diverge from the source whenever the source no-ops a write: source has id=a, name='Original' dest missing id=a caller skipDuplicates(createDocuments([{id: a, name: 'Duplicate'}])) The source skips a (already present) but the Mirror still forwards {id: a, name: 'Duplicate'} to the destination, which inserts it — the two replicas now hold different values for the same row. The same code path also throws "All documents must have an sequence if one is set" inside Database::createDocuments when a partially-skipped batch reaches the destination, because the inserted doc has a sequence assigned by the source adapter while the skipped doc does not. Mirror catches and silently logs that exception, so the destination ends up missing both rows. Fix: wrap the user's onNext to capture the docs the source actually persisted, then forward only those to the destination. Skipped docs are now correctly never mirrored. Adds testCreateDocumentsSkipDuplicatesDoesNotDivergeDestination — seeds a row in the source only (bypassing the mirror), then calls skipDuplicates(createDocuments(...)) with a different value for that id plus a new row. Asserts source keeps its original value, destination ends up with only the new row, and the skipped value never reaches either side. --- src/Database/Mirror.php | 22 +++++++- tests/e2e/Adapter/MirrorTest.php | 89 ++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+), 2 deletions(-) diff --git a/src/Database/Mirror.php b/src/Database/Mirror.php index 5d5777a5d..901ae4d04 100644 --- a/src/Database/Mirror.php +++ b/src/Database/Mirror.php @@ -601,12 +601,25 @@ public function createDocuments( ?callable $onNext = null, ?callable $onError = null, ): int { + // Capture the docs the source actually persisted (rather than the input) + // so that, in skipDuplicates mode, we don't end up forwarding skipped + // duplicates to the destination — that would let the destination diverge + // from the source whenever the source no-ops a write. + /** @var array $insertedFromSource */ + $insertedFromSource = []; + $captureOnNext = function (Document $doc) use (&$insertedFromSource, $onNext): void { + $insertedFromSource[] = $doc; + if ($onNext !== null) { + $onNext($doc); + } + }; + $modified = $this->source->skipDuplicates( fn () => $this->source->createDocuments( $collection, $documents, $batchSize, - $onNext, + $captureOnNext, $onError, ), $this->skipDuplicates @@ -624,10 +637,15 @@ public function createDocuments( return $modified; } + // Nothing for the source to mirror — early-out before touching destination. + if (empty($insertedFromSource)) { + return $modified; + } + try { $clones = []; - foreach ($documents as $document) { + foreach ($insertedFromSource as $document) { $clone = clone $document; foreach ($this->writeFilters as $filter) { diff --git a/tests/e2e/Adapter/MirrorTest.php b/tests/e2e/Adapter/MirrorTest.php index 31bf3f3b6..634c2df0d 100644 --- a/tests/e2e/Adapter/MirrorTest.php +++ b/tests/e2e/Adapter/MirrorTest.php @@ -313,6 +313,95 @@ public function testDeleteMirroredDocument(): void $this->assertTrue($database->getDestination()->getDocument('testDeleteMirroredDocument', $document->getId())->isEmpty()); } + /** + * Regression: when skipDuplicates causes the source to no-op a write, + * the Mirror must NOT forward the input document to the destination. + * Otherwise the destination would diverge from the source — receiving + * the would-be value for a row the source kept unchanged. + */ + public function testCreateDocumentsSkipDuplicatesDoesNotDivergeDestination(): void + { + $database = $this->getDatabase(); + $collection = 'mirrorSkipDup'; + + $database->createCollection($collection, attributes: [ + new Document([ + '$id' => 'name', + 'type' => Database::VAR_STRING, + 'required' => true, + 'size' => Database::LENGTH_KEY, + ]), + ], permissions: [ + Permission::create(Role::any()), + Permission::read(Role::any()), + ], documentSecurity: false); + + // Seed the SOURCE only (bypass the mirror) with the row we want to + // skipDuplicates over later. Destination intentionally does NOT have it. + $database->getSource()->createDocument($collection, new Document([ + '$id' => 'dup', + 'name' => 'Original', + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ])); + + // Sanity check setup + $this->assertSame( + 'Original', + $database->getSource()->getDocument($collection, 'dup')->getAttribute('name') + ); + $this->assertTrue( + $database->getDestination()->getDocument($collection, 'dup')->isEmpty() + ); + + // Now do skipDuplicates createDocuments via the Mirror with a different + // value for 'dup' plus one new doc 'fresh'. The source must skip 'dup' + // (already exists) and insert 'fresh'. The Mirror must forward only + // 'fresh' to the destination — NOT 'dup' with the would-be value. + $database->skipDuplicates(fn () => $database->createDocuments($collection, [ + new Document([ + '$id' => 'dup', + 'name' => 'WouldBe', + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]), + new Document([ + '$id' => 'fresh', + 'name' => 'Fresh', + '$permissions' => [ + Permission::read(Role::any()), + Permission::create(Role::any()), + ], + ]), + ])); + + // Source: 'dup' untouched, 'fresh' inserted. + $this->assertSame( + 'Original', + $database->getSource()->getDocument($collection, 'dup')->getAttribute('name') + ); + $this->assertSame( + 'Fresh', + $database->getSource()->getDocument($collection, 'fresh')->getAttribute('name') + ); + + // Destination: 'fresh' was forwarded, 'dup' was NOT (would-be value + // is the bug — destination should not have 'dup' at all because the + // source skipped it and the destination was never told to insert it). + $this->assertTrue( + $database->getDestination()->getDocument($collection, 'dup')->isEmpty(), + 'Mirror must not forward source-skipped docs to the destination' + ); + $this->assertSame( + 'Fresh', + $database->getDestination()->getDocument($collection, 'fresh')->getAttribute('name') + ); + } + protected function deleteColumn(string $collection, string $column): bool { $sqlTable = "`" . self::$source->getDatabase() . "`.`" . self::$source->getNamespace() . "_" . $collection . "`"; From eb99cf1b9c7aa0b89ae40836c6beddc5a96a664a Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Mon, 13 Apr 2026 07:10:46 +0100 Subject: [PATCH 14/25] skipDuplicates: simplify by moving pre-filter to orchestrator only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SQL adapter previously carried a pre-filter SELECT plus a _createdAt reconciliation pass to compensate for INSERT IGNORE not telling us which rows were actually inserted vs skipped. The Mongo adapter pulled the same trick via Client::upsertWithCounts() (mongo PR #37). Both layers were solving the same problem twice. Move the pre-filter back to Database::createDocuments where it ran originally, and let each adapter's INSERT IGNORE / ON CONFLICT DO NOTHING / upsert+\$setOnInsert serve only as a race-safety net. The adapter just returns its input as the inserted set. - Database.php: restore \$preExistingIds pre-fetch + chunk filter - SQL.php: drop pre-filter, drop _createdAt reconciliation, drop buildUidTenantLookup helper (-173 lines) - Mongo.php: revert to plain Client::upsert(), return input docs (withTransaction skipDuplicates bypass stays — still needed to avoid WriteConflict under snapshot isolation) - composer.json/lock: back to stable utopia-php/mongo 1.* Trade-off: in the race window between the orchestrator's pre-fetch SELECT and the adapter INSERT, \$modified over-reports by N and onNext fires for N skipped docs. Common case stays correct. --- composer.json | 11 +-- composer.lock | 44 +++------ src/Database/Adapter/Mongo.php | 19 +--- src/Database/Adapter/SQL.php | 173 --------------------------------- src/Database/Database.php | 33 ++++++- 5 files changed, 45 insertions(+), 235 deletions(-) diff --git a/composer.json b/composer.json index ff0d71e91..824b193a3 100755 --- a/composer.json +++ b/composer.json @@ -4,8 +4,7 @@ "type": "library", "keywords": ["php","framework", "upf", "utopia", "database"], "license": "MIT", - "minimum-stability": "dev", - "prefer-stable": true, + "minimum-stability": "stable", "autoload": { "psr-4": {"Utopia\\Database\\": "src/Database"} }, @@ -42,7 +41,7 @@ "utopia-php/console": "0.1.*", "utopia-php/cache": "1.*", "utopia-php/pools": "1.*", - "utopia-php/mongo": "dev-upsert-return-upserted-count" + "utopia-php/mongo": "1.*" }, "require-dev": { "fakerphp/faker": "1.23.*", @@ -60,12 +59,6 @@ "mongodb/mongodb": "Needed to support MongoDB Database Adapter" }, - "repositories": [ - { - "type": "vcs", - "url": "https://github.com/utopia-php/mongo.git" - } - ], "config": { "allow-plugins": { "php-http/discovery": false, diff --git a/composer.lock b/composer.lock index b1bf6d388..f67fd021e 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "4f64a9be4913ee68ff7df2a8db313eac", + "content-hash": "e7e2b8f8ff6424bb98827b11e25323e8", "packages": [ { "name": "brick/math", @@ -2126,16 +2126,16 @@ }, { "name": "utopia-php/mongo", - "version": "dev-upsert-return-upserted-count", + "version": "1.0.2", "source": { "type": "git", "url": "https://github.com/utopia-php/mongo.git", - "reference": "25d0a8c02b1a39d3f4c5b2a9cbb76171319c7ba5" + "reference": "677a21c53f7a1316c528b4b45b3fce886cee7223" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/utopia-php/mongo/zipball/25d0a8c02b1a39d3f4c5b2a9cbb76171319c7ba5", - "reference": "25d0a8c02b1a39d3f4c5b2a9cbb76171319c7ba5", + "url": "https://api.github.com/repos/utopia-php/mongo/zipball/677a21c53f7a1316c528b4b45b3fce886cee7223", + "reference": "677a21c53f7a1316c528b4b45b3fce886cee7223", "shasum": "" }, "require": { @@ -2157,25 +2157,7 @@ "Utopia\\Mongo\\": "src" } }, - "autoload-dev": { - "psr-4": { - "Utopia\\Tests\\": "tests" - } - }, - "scripts": { - "test": [ - "phpunit" - ], - "check": [ - "vendor/bin/phpstan analyse --memory-limit=1G src" - ], - "format": [ - "vendor/bin/pint" - ], - "lint": [ - "vendor/bin/pint --test" - ] - }, + "notification-url": "https://packagist.org/downloads/", "license": [ "MIT" ], @@ -2198,10 +2180,10 @@ "utopia" ], "support": { - "source": "https://github.com/utopia-php/mongo/tree/upsert-return-upserted-count", - "issues": "https://github.com/utopia-php/mongo/issues" + "issues": "https://github.com/utopia-php/mongo/issues", + "source": "https://github.com/utopia-php/mongo/tree/1.0.2" }, - "time": "2026-04-13T01:32:46+00:00" + "time": "2026-03-18T02:45:50+00:00" }, { "name": "utopia-php/pools", @@ -4600,11 +4582,9 @@ } ], "aliases": [], - "minimum-stability": "dev", - "stability-flags": { - "utopia-php/mongo": 20 - }, - "prefer-stable": true, + "minimum-stability": "stable", + "stability-flags": {}, + "prefer-stable": false, "prefer-lowest": false, "platform": { "php": ">=8.4", diff --git a/src/Database/Adapter/Mongo.php b/src/Database/Adapter/Mongo.php index bc0092a44..480c2d9ac 100644 --- a/src/Database/Adapter/Mongo.php +++ b/src/Database/Adapter/Mongo.php @@ -1511,8 +1511,7 @@ public function createDocuments(Document $collection, array $documents): array } $operations = []; - $opIndexToDocIndex = []; - foreach ($records as $i => $record) { + foreach ($records as $record) { $filter = ['_uid' => $record['_uid'] ?? '']; if ($this->sharedTables) { $filter['_tenant'] = $record['_tenant'] ?? $this->getTenant(); @@ -1527,7 +1526,6 @@ public function createDocuments(Document $collection, array $documents): array continue; } - $opIndexToDocIndex[\count($operations)] = $i; $operations[] = [ 'filter' => $filter, 'update' => ['$setOnInsert' => $setOnInsert], @@ -1535,23 +1533,12 @@ public function createDocuments(Document $collection, array $documents): array } try { - $result = $this->client->upsertWithCounts($name, $operations, $options); + $this->client->upsert($name, $operations, $options); } catch (MongoException $e) { throw $this->processException($e); } - // Map MongoDB's upserted entries back to the originating documents - // so the caller sees an exact "actually inserted" list — matched - // (pre-existing) docs are silently dropped. - $inserted = []; - foreach ($result['upserted'] as $entry) { - $docIndex = $opIndexToDocIndex[$entry['index']] ?? null; - if ($docIndex !== null) { - $inserted[] = $documents[$docIndex]; - } - } - - return $inserted; + return $documents; } try { diff --git a/src/Database/Adapter/SQL.php b/src/Database/Adapter/SQL.php index f2e84fa30..e39fd3528 100644 --- a/src/Database/Adapter/SQL.php +++ b/src/Database/Adapter/SQL.php @@ -2460,68 +2460,6 @@ protected function execute(mixed $stmt): bool return $stmt->execute(); } - /** - * Build a `_uid IN (...) [AND _tenant ...]` WHERE clause (and matching binds) - * for a batch of documents. Used by the skipDuplicates pre-filter and the - * race-condition reconciliation path. - * - * @param array $documents - * @param string $bindPrefix Prefix for bind keys; must differ between concurrent uses. - * @return array{where: string, tenantSelect: string, binds: array} - */ - private function buildUidTenantLookup(array $documents, string $bindPrefix): array - { - $binds = []; - $placeholders = []; - $uids = \array_values(\array_unique(\array_filter( - \array_map(fn (Document $doc) => $doc->getId(), $documents), - fn ($v) => $v !== null - ))); - foreach ($uids as $i => $uid) { - $key = ':' . $bindPrefix . 'uid_' . $i; - $placeholders[] = $key; - $binds[$key] = $uid; - } - - $tenantFilter = ''; - $tenantSelect = ''; - if ($this->sharedTables) { - if ($this->tenantPerDocument) { - $tenants = \array_values(\array_unique(\array_filter( - \array_map(fn (Document $doc) => $doc->getTenant(), $documents), - fn ($v) => $v !== null - ))); - $tenantPlaceholders = []; - foreach ($tenants as $j => $tenant) { - $tKey = ':' . $bindPrefix . 'tenant_' . $j; - $tenantPlaceholders[] = $tKey; - $binds[$tKey] = $tenant; - } - $tenantFilter = ' AND _tenant IN (' . \implode(', ', $tenantPlaceholders) . ')'; - $tenantSelect = ', _tenant'; - } else { - $tenantKey = ':' . $bindPrefix . 'tenant'; - $tenantFilter = ' AND _tenant = ' . $tenantKey; - $binds[$tenantKey] = $this->getTenant(); - } - } - - // No UIDs to match → return a clause that matches nothing without leaving dangling binds. - if (empty($placeholders)) { - return [ - 'where' => '1=0', - 'tenantSelect' => $tenantSelect, - 'binds' => [], - ]; - } - - return [ - 'where' => '_uid IN (' . \implode(', ', $placeholders) . ')' . $tenantFilter, - 'tenantSelect' => $tenantSelect, - 'binds' => $binds, - ]; - } - /** * Create Documents in batches * @@ -2539,51 +2477,6 @@ public function createDocuments(Document $collection, array $documents): array return $documents; } - // Pre-filter existing UIDs to prevent race-condition duplicates. - if ($this->skipDuplicates) { - $name = $this->filter($collection->getId()); - $lookup = $this->buildUidTenantLookup($documents, '_dup_'); - - if (!empty($lookup['binds'])) { - try { - $sql = 'SELECT _uid' . $lookup['tenantSelect'] - . ' FROM ' . $this->getSQLTable($name) - . ' WHERE ' . $lookup['where']; - - $stmt = $this->getPDO()->prepare($sql); - foreach ($lookup['binds'] as $k => $v) { - $stmt->bindValue($k, $v, $this->getPDOType($v)); - } - $stmt->execute(); - $rows = $stmt->fetchAll(); - $stmt->closeCursor(); - - if ($this->sharedTables && $this->tenantPerDocument) { - $existingKeys = []; - foreach ($rows as $row) { - $existingKeys[$row['_tenant'] . ':' . $row['_uid']] = true; - } - $documents = \array_values(\array_filter( - $documents, - fn (Document $doc) => !isset($existingKeys[$doc->getTenant() . ':' . $doc->getId()]) - )); - } else { - $existingUids = \array_flip(\array_column($rows, '_uid')); - $documents = \array_values(\array_filter( - $documents, - fn (Document $doc) => !isset($existingUids[$doc->getId()]) - )); - } - } catch (PDOException $e) { - throw $this->processException($e); - } - } - - if (empty($documents)) { - return []; - } - } - $spatialAttributes = $this->getSpatialAttributes($collection); $collection = $collection->getId(); try { @@ -2692,72 +2585,6 @@ public function createDocuments(Document $collection, array $documents): array $this->execute($stmt); - // Reconcile returned docs with actual inserts when a race condition skipped rows. - if ($this->skipDuplicates && $stmt->rowCount() < \count($documents)) { - $expectedTimestamps = []; - foreach ($documents as $doc) { - $eKey = ($this->sharedTables && $this->tenantPerDocument) - ? $doc->getTenant() . ':' . $doc->getId() - : $doc->getId(); - $expectedTimestamps[$eKey] = $doc->getCreatedAt(); - } - - $lookup = $this->buildUidTenantLookup($documents, '_vfy_'); - $verifySql = 'SELECT _uid' . $lookup['tenantSelect'] . ', ' . $this->quote($this->filter('_createdAt')) - . ' FROM ' . $this->getSQLTable($name) - . ' WHERE ' . $lookup['where']; - - $verifyStmt = $this->getPDO()->prepare($verifySql); - foreach ($lookup['binds'] as $k => $v) { - $verifyStmt->bindValue($k, $v, $this->getPDOType($v)); - } - $verifyStmt->execute(); - $rows = $verifyStmt->fetchAll(); - $verifyStmt->closeCursor(); - - // Normalise timestamps — Postgres omits .000 for round seconds. - $normalizeTimestamp = fn (?string $ts): string => $ts !== null - ? (new \DateTime($ts))->format('Y-m-d H:i:s.v') - : ''; - - $actualTimestamps = []; - foreach ($rows as $row) { - $key = ($this->sharedTables && $this->tenantPerDocument) - ? $row['_tenant'] . ':' . $row['_uid'] - : $row['_uid']; - $actualTimestamps[$key] = $normalizeTimestamp($row['_createdAt']); - } - - $insertedDocs = []; - foreach ($documents as $doc) { - $key = ($this->sharedTables && $this->tenantPerDocument) - ? $doc->getTenant() . ':' . $doc->getId() - : $doc->getId(); - if (isset($actualTimestamps[$key]) && $actualTimestamps[$key] === $normalizeTimestamp($expectedTimestamps[$key] ?? null)) { - $insertedDocs[] = $doc; - } - } - $documents = $insertedDocs; - - // Rebuild permissions for actually-inserted docs only - $permissions = []; - $bindValuesPermissions = []; - foreach ($documents as $index => $document) { - foreach (Database::PERMISSIONS as $type) { - foreach ($document->getPermissionsByType($type) as $permission) { - $tenantBind = $this->sharedTables ? ", :_tenant_{$index}" : ''; - $permission = \str_replace('"', '', $permission); - $permission = "('{$type}', '{$permission}', :_uid_{$index} {$tenantBind})"; - $permissions[] = $permission; - $bindValuesPermissions[":_uid_{$index}"] = $document->getId(); - if ($this->sharedTables) { - $bindValuesPermissions[":_tenant_{$index}"] = $document->getTenant(); - } - } - } - } - } - if (!empty($permissions)) { $tenantColumn = $this->sharedTables ? ', _tenant' : ''; $permissions = \implode(', ', $permissions); diff --git a/src/Database/Database.php b/src/Database/Database.php index fefdd9a0c..9692782a5 100644 --- a/src/Database/Database.php +++ b/src/Database/Database.php @@ -877,18 +877,23 @@ private function tenantKey(Document $document): string * * @param Document $collection * @param array $documents Source documents (only IDs are read) + * @param bool $idsOnly When true, uses Query::select(['$id']) for a lighter fetch * @return array */ - private function fetchExistingByIds(Document $collection, array $documents): array + private function fetchExistingByIds(Document $collection, array $documents, bool $idsOnly = false): array { $collectionId = $collection->getId(); $tenantPerDocument = $this->adapter->getSharedTables() && $this->adapter->getTenantPerDocument(); $result = []; - $buildQueries = fn (array $chunk) => [ - Query::equal('$id', $chunk), - Query::limit(\count($chunk)), - ]; + $buildQueries = function (array $chunk) use ($idsOnly) { + $queries = [Query::equal('$id', $chunk)]; + if ($idsOnly) { + $queries[] = Query::select(['$id']); + } + $queries[] = Query::limit(\count($chunk)); + return $queries; + }; if ($tenantPerDocument) { $idsByTenant = []; @@ -5760,6 +5765,14 @@ public function createDocuments( $documents = $deduplicated; } + // Pre-fetch existing IDs so chunks only carry docs that don't already exist. + // The adapter still applies INSERT IGNORE / ON CONFLICT DO NOTHING / upsert as a + // race-safety net, but the pre-filter handles the common case so the adapter can + // simply return its input as the inserted set without per-row reconciliation. + $preExistingIds = $this->skipDuplicates + ? $this->fetchExistingByIds($collection, $documents, idsOnly: true) + : []; + /** @var array> $deferredRelationships */ $deferredRelationships = []; $relationships = []; @@ -5828,6 +5841,16 @@ public function createDocuments( } foreach (\array_chunk($documents, $batchSize) as $chunk) { + if ($this->skipDuplicates && !empty($preExistingIds)) { + $chunk = \array_values(\array_filter( + $chunk, + fn (Document $doc) => !isset($preExistingIds[$this->tenantKey($doc)]) + )); + if (empty($chunk)) { + continue; + } + } + // skipDuplicates wraps withTransaction so the adapter's flag is set before // Mongo::withTransaction decides whether to start a real transaction. $batch = $this->adapter->skipDuplicates( From 89e4cf8b0c8e9e5476a367f5bbfdc60bcbb20e3a Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Mon, 13 Apr 2026 07:35:14 +0100 Subject: [PATCH 15/25] skipDuplicates: drop deferred-relationships, inline pre-filter, tighten comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The deferred-relationships mechanism was useful when the adapter could report which docs were actually inserted (via reconciliation/ upsertWithCounts). With those gone, $batch always equals $chunk, so the defer-then-match dance is dead weight. The orchestrator already pre- filters known duplicates upfront, so any survivor is expected to insert and relationships can be created inline like the non-skipDuplicates path. - Database.php: pre-filter $documents once, before encode/validate, instead of filtering each chunk inline. Drop $deferredRelationships scaffolding and the per-chunk match loop. Collapse if/elseif into one inline createDocumentRelationships call. - Mongo.php: tighten withTransaction skipDuplicates comment (the "relationship writes are deferred" rationale is no longer accurate). - Mirror.php: tighten the captureOnNext comment. - composer.lock: revert unrelated symfony-polyfill-php85 / mongo 1.0.0→ 1.0.2 bumps that crept in from an earlier composer update. Race window unchanged: a doc that passes pre-filter but loses to a concurrent insert still produces an orphan relationship and an over-counted $modified — same as before this commit. --- composer.lock | 24 ++++++------ src/Database/Adapter/Mongo.php | 14 ++----- src/Database/Database.php | 68 ++++++---------------------------- src/Database/Mirror.php | 6 +-- 4 files changed, 28 insertions(+), 84 deletions(-) diff --git a/composer.lock b/composer.lock index f67fd021e..ad3e77da1 100644 --- a/composer.lock +++ b/composer.lock @@ -1807,16 +1807,16 @@ }, { "name": "symfony/polyfill-php85", - "version": "v1.34.0", + "version": "v1.33.0", "source": { "type": "git", "url": "https://github.com/symfony/polyfill-php85.git", - "reference": "2c408a6bb0313e6001a83628dc5506100474254e" + "reference": "d4e5fcd4ab3d998ab16c0db48e6cbb9a01993f91" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/2c408a6bb0313e6001a83628dc5506100474254e", - "reference": "2c408a6bb0313e6001a83628dc5506100474254e", + "url": "https://api.github.com/repos/symfony/polyfill-php85/zipball/d4e5fcd4ab3d998ab16c0db48e6cbb9a01993f91", + "reference": "d4e5fcd4ab3d998ab16c0db48e6cbb9a01993f91", "shasum": "" }, "require": { @@ -1863,7 +1863,7 @@ "shim" ], "support": { - "source": "https://github.com/symfony/polyfill-php85/tree/v1.34.0" + "source": "https://github.com/symfony/polyfill-php85/tree/v1.33.0" }, "funding": [ { @@ -1883,7 +1883,7 @@ "type": "tidelift" } ], - "time": "2026-04-10T16:50:15+00:00" + "time": "2025-06-23T16:12:55+00:00" }, { "name": "symfony/service-contracts", @@ -2126,16 +2126,16 @@ }, { "name": "utopia-php/mongo", - "version": "1.0.2", + "version": "1.0.0", "source": { "type": "git", "url": "https://github.com/utopia-php/mongo.git", - "reference": "677a21c53f7a1316c528b4b45b3fce886cee7223" + "reference": "45bedf36c2c946ec7a0a3e59b9f12f772de0b01d" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/utopia-php/mongo/zipball/677a21c53f7a1316c528b4b45b3fce886cee7223", - "reference": "677a21c53f7a1316c528b4b45b3fce886cee7223", + "url": "https://api.github.com/repos/utopia-php/mongo/zipball/45bedf36c2c946ec7a0a3e59b9f12f772de0b01d", + "reference": "45bedf36c2c946ec7a0a3e59b9f12f772de0b01d", "shasum": "" }, "require": { @@ -2181,9 +2181,9 @@ ], "support": { "issues": "https://github.com/utopia-php/mongo/issues", - "source": "https://github.com/utopia-php/mongo/tree/1.0.2" + "source": "https://github.com/utopia-php/mongo/tree/1.0.0" }, - "time": "2026-03-18T02:45:50+00:00" + "time": "2026-02-12T05:54:06+00:00" }, { "name": "utopia-php/pools", diff --git a/src/Database/Adapter/Mongo.php b/src/Database/Adapter/Mongo.php index 480c2d9ac..311b60476 100644 --- a/src/Database/Adapter/Mongo.php +++ b/src/Database/Adapter/Mongo.php @@ -122,10 +122,7 @@ public function withTransaction(callable $callback): mixed return $callback(); } - // skipDuplicates uses upsert + $setOnInsert, whose filter query would hit - // WriteConflict (E112) under transaction snapshot isolation. The upsert is - // atomic on its own and relationship writes are deferred by the Database - // layer, so running outside a transaction is both safe and necessary. + // upsert + $setOnInsert hits WriteConflict (E112) under txn snapshot isolation. if ($this->skipDuplicates) { return $callback(); } @@ -1500,11 +1497,7 @@ public function createDocuments(Document $collection, array $documents): array $records[] = $record; } - // skipDuplicates: use upsert + $setOnInsert so an already-present row (whether - // from a prior batch or a concurrent writer) becomes a silent no-op. MongoDB has - // no INSERT IGNORE, and plain insertMany aborts the transaction on any duplicate. - // Adapter::withTransaction opts out of a real transaction in this mode to avoid - // snapshot-isolation-induced WriteConflict errors from the upsert's filter query. + // insertMany aborts the txn on any duplicate; upsert + $setOnInsert no-ops instead. if ($this->skipDuplicates) { if (empty($records)) { return []; @@ -1517,8 +1510,7 @@ public function createDocuments(Document $collection, array $documents): array $filter['_tenant'] = $record['_tenant'] ?? $this->getTenant(); } - // Filter fields cannot reappear in $setOnInsert — MongoDB rejects - // the update with a path-conflict error otherwise. + // Filter fields can't reappear in $setOnInsert (mongo path-conflict error). $setOnInsert = $record; unset($setOnInsert['_uid'], $setOnInsert['_tenant']); diff --git a/src/Database/Database.php b/src/Database/Database.php index 9692782a5..0d33bf70b 100644 --- a/src/Database/Database.php +++ b/src/Database/Database.php @@ -5748,7 +5748,8 @@ public function createDocuments( $time = DateTime::now(); $modified = 0; - // Deduplicate intra-batch documents by ID (tenant-aware). First occurrence wins. + // skipDuplicates: dedupe intra-batch (first wins) then drop already-existing rows. + // Adapter INSERT IGNORE / ON CONFLICT / upsert is the race-safety net. if ($this->skipDuplicates) { $seenIds = []; $deduplicated = []; @@ -5763,21 +5764,14 @@ public function createDocuments( $deduplicated[] = $document; } $documents = $deduplicated; - } - - // Pre-fetch existing IDs so chunks only carry docs that don't already exist. - // The adapter still applies INSERT IGNORE / ON CONFLICT DO NOTHING / upsert as a - // race-safety net, but the pre-filter handles the common case so the adapter can - // simply return its input as the inserted set without per-row reconciliation. - $preExistingIds = $this->skipDuplicates - ? $this->fetchExistingByIds($collection, $documents, idsOnly: true) - : []; - /** @var array> $deferredRelationships */ - $deferredRelationships = []; - $relationships = []; - if ($this->skipDuplicates && $this->resolveRelationships) { - $relationships = \array_filter($collection->getAttribute('attributes', []), fn ($attr) => $attr['type'] === self::VAR_RELATIONSHIP); + $preExistingIds = $this->fetchExistingByIds($collection, $documents, idsOnly: true); + if (!empty($preExistingIds)) { + $documents = \array_values(\array_filter( + $documents, + fn (Document $doc) => !isset($preExistingIds[$this->tenantKey($doc)]) + )); + } } foreach ($documents as $document) { @@ -5819,21 +5813,7 @@ public function createDocuments( } } - if ($this->resolveRelationships && !empty($relationships)) { - // Defer relationship writes until after INSERT so we don't orphan records for duplicates. - $relationshipData = []; - foreach ($relationships as $rel) { - $key = $rel['key']; - $value = $document->getAttribute($key); - if ($value !== null) { - $relationshipData[$key] = $value; - } - $document->removeAttribute($key); - } - if (!empty($relationshipData)) { - $deferredRelationships[$this->tenantKey($document)] = $relationshipData; - } - } elseif ($this->resolveRelationships) { + if ($this->resolveRelationships) { $document = $this->silent(fn () => $this->createDocumentRelationships($collection, $document)); } @@ -5841,38 +5821,12 @@ public function createDocuments( } foreach (\array_chunk($documents, $batchSize) as $chunk) { - if ($this->skipDuplicates && !empty($preExistingIds)) { - $chunk = \array_values(\array_filter( - $chunk, - fn (Document $doc) => !isset($preExistingIds[$this->tenantKey($doc)]) - )); - if (empty($chunk)) { - continue; - } - } - - // skipDuplicates wraps withTransaction so the adapter's flag is set before - // Mongo::withTransaction decides whether to start a real transaction. + // Set adapter flag before withTransaction so Mongo can opt out of a real txn. $batch = $this->adapter->skipDuplicates( fn () => $this->withTransaction(fn () => $this->adapter->createDocuments($collection, $chunk)), $this->skipDuplicates ); - // Create deferred relationships only for docs that were actually inserted - if ($this->skipDuplicates && $this->resolveRelationships && \count($deferredRelationships) > 0) { - foreach ($batch as $insertedDoc) { - $deferKey = $this->tenantKey($insertedDoc); - if (\array_key_exists($deferKey, $deferredRelationships)) { - $relDoc = clone $insertedDoc; - foreach ($deferredRelationships[$deferKey] as $key => $value) { - $relDoc->setAttribute($key, $value); - } - $this->silent(fn () => $this->createDocumentRelationships($collection, $relDoc)); - unset($deferredRelationships[$deferKey]); - } - } - } - $batch = $this->adapter->getSequences($collection->getId(), $batch); if (!$this->inBatchRelationshipPopulation && $this->resolveRelationships) { diff --git a/src/Database/Mirror.php b/src/Database/Mirror.php index 901ae4d04..7b1bd6a4c 100644 --- a/src/Database/Mirror.php +++ b/src/Database/Mirror.php @@ -601,10 +601,8 @@ public function createDocuments( ?callable $onNext = null, ?callable $onError = null, ): int { - // Capture the docs the source actually persisted (rather than the input) - // so that, in skipDuplicates mode, we don't end up forwarding skipped - // duplicates to the destination — that would let the destination diverge - // from the source whenever the source no-ops a write. + // Forward only docs the source actually persisted, so skipDuplicates no-ops + // on the source don't inject would-be values into the destination. /** @var array $insertedFromSource */ $insertedFromSource = []; $captureOnNext = function (Document $doc) use (&$insertedFromSource, $onNext): void { From 41704eb1c90c9ce08c698e24e7774e4122c25e48 Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Mon, 13 Apr 2026 16:21:30 +0100 Subject: [PATCH 16/25] Address Jake's review on PR #852 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #1 Drop $enable flag on skipDuplicates() scope guard The $enable param made every non-skipDuplicates createDocuments call pay for a closure allocation + extra function call. Branch at the call site instead so the cost only applies when the flag is actually set. - Adapter::skipDuplicates(callable, bool) → skipDuplicates(callable) - Database::skipDuplicates(callable, bool) → skipDuplicates(callable) - Database::createDocuments, Mirror::createDocuments, Pool::delegate, Pool::withTransaction now branch inline. #2 Drop fetchExistingByIds helper, inline find() The helper's per-tenant grouping defended a hypothetical multi-tenant batching scenario that no caller exercises (relationships are intra- tenant, callers always batch per tenant). Existing patterns in the same file (refetchDocuments, relationship loading) just call find() directly. Match that idiom and drop ~70 lines. #4 Mirror: only capture inserted docs in skipDuplicates mode The captureOnNext accumulator paid the cost (closure + per-doc array push) on every createDocuments call, including the common non-skip path. Branch at the entry of Mirror::createDocuments so the capture only happens when skipDuplicates is set; the non-skip path passes through to source/destination unchanged. #5 Move getInsertKeyword/Suffix/PermissionsSuffix to getters cluster Were sitting next to createDocuments(); moved to the getSupport* cluster around line 1030 where other adapter-capability shims live. Not addressed: - #2 partial: the existing patterns (refetchDocuments etc.) don't handle tenant-per-document multi-tenant batches either, so this is consistent. - #3 (drop the pre-filter): rejected. createDocumentRelationships runs in the encoding loop BEFORE the adapter's INSERT IGNORE no-ops the parent, so dropping the pre-filter would deterministically duplicate child rows on every CSV re-import of a collection with relationships (not a race window — every call). The relationships test verifies this. Reverting would require reintroducing the deferred-relationships scaffolding we just removed, and the adapter still couldn't tell us which parents were actually inserted (SQL INSERT IGNORE has no per-row reporting). Pre-filter stays. --- src/Database/Adapter.php | 9 +-- src/Database/Adapter/Pool.php | 30 ++++---- src/Database/Adapter/SQL.php | 54 +++++++-------- src/Database/Database.php | 127 ++++++++++++---------------------- src/Database/Mirror.php | 70 +++++++++++-------- 5 files changed, 134 insertions(+), 156 deletions(-) diff --git a/src/Database/Adapter.php b/src/Database/Adapter.php index 757d0c464..26fe8249c 100644 --- a/src/Database/Adapter.php +++ b/src/Database/Adapter.php @@ -39,20 +39,13 @@ abstract class Adapter * Run a callback with skipDuplicates enabled. * Duplicate key errors during createDocuments() will be silently skipped * instead of thrown. Nestable — saves and restores previous state. - * Pass $enable = false to run the callback without toggling the flag - * (useful for conditional forwarding from wrappers like Pool/Mirror). * * @template T * @param callable(): T $callback - * @param bool $enable * @return T */ - public function skipDuplicates(callable $callback, bool $enable = true): mixed + public function skipDuplicates(callable $callback): mixed { - if (!$enable) { - return $callback(); - } - $previous = $this->skipDuplicates; $this->skipDuplicates = true; diff --git a/src/Database/Adapter/Pool.php b/src/Database/Adapter/Pool.php index 648372402..7bbfb98f2 100644 --- a/src/Database/Adapter/Pool.php +++ b/src/Database/Adapter/Pool.php @@ -43,10 +43,12 @@ public function __construct(UtopiaPool $pool) public function delegate(string $method, array $args): mixed { if ($this->pinnedAdapter !== null) { - return $this->pinnedAdapter->skipDuplicates( - fn () => $this->pinnedAdapter->{$method}(...$args), - $this->skipDuplicates - ); + if ($this->skipDuplicates) { + return $this->pinnedAdapter->skipDuplicates( + fn () => $this->pinnedAdapter->{$method}(...$args) + ); + } + return $this->pinnedAdapter->{$method}(...$args); } return $this->pool->use(function (Adapter $adapter) use ($method, $args) { @@ -69,10 +71,12 @@ public function delegate(string $method, array $args): mixed $adapter->setMetadata($key, $value); } - return $adapter->skipDuplicates( - fn () => $adapter->{$method}(...$args), - $this->skipDuplicates - ); + if ($this->skipDuplicates) { + return $adapter->skipDuplicates( + fn () => $adapter->{$method}(...$args) + ); + } + return $adapter->{$method}(...$args); }); } @@ -152,10 +156,12 @@ public function withTransaction(callable $callback): mixed $this->pinnedAdapter = $adapter; try { - return $adapter->skipDuplicates( - fn () => $adapter->withTransaction($callback), - $this->skipDuplicates - ); + if ($this->skipDuplicates) { + return $adapter->skipDuplicates( + fn () => $adapter->withTransaction($callback) + ); + } + return $adapter->withTransaction($callback); } finally { $this->pinnedAdapter = null; } diff --git a/src/Database/Adapter/SQL.php b/src/Database/Adapter/SQL.php index e39fd3528..3fe2696db 100644 --- a/src/Database/Adapter/SQL.php +++ b/src/Database/Adapter/SQL.php @@ -1029,6 +1029,33 @@ public function getSupportForHostname(): bool return true; } + /** + * Returns the INSERT keyword, optionally with IGNORE for duplicate handling. + * Override in adapter subclasses for DB-specific syntax. + */ + protected function getInsertKeyword(): string + { + return $this->skipDuplicates ? 'INSERT IGNORE INTO' : 'INSERT INTO'; + } + + /** + * Returns a suffix appended after VALUES clause for duplicate handling. + * Override in adapter subclasses (e.g., Postgres uses ON CONFLICT DO NOTHING). + */ + protected function getInsertSuffix(string $table): string + { + return ''; + } + + /** + * Returns a suffix for the permissions INSERT statement when ignoring duplicates. + * Override in adapter subclasses for DB-specific syntax. + */ + protected function getInsertPermissionsSuffix(): string + { + return ''; + } + /** * Get current attribute count from collection document * @@ -2611,33 +2638,6 @@ public function createDocuments(Document $collection, array $documents): array return $documents; } - /** - * Returns the INSERT keyword, optionally with IGNORE for duplicate handling. - * Override in adapter subclasses for DB-specific syntax. - */ - protected function getInsertKeyword(): string - { - return $this->skipDuplicates ? 'INSERT IGNORE INTO' : 'INSERT INTO'; - } - - /** - * Returns a suffix appended after VALUES clause for duplicate handling. - * Override in adapter subclasses (e.g., Postgres uses ON CONFLICT DO NOTHING). - */ - protected function getInsertSuffix(string $table): string - { - return ''; - } - - /** - * Returns a suffix for the permissions INSERT statement when ignoring duplicates. - * Override in adapter subclasses for DB-specific syntax. - */ - protected function getInsertPermissionsSuffix(): string - { - return ''; - } - /** * @param Document $collection * @param string $attribute diff --git a/src/Database/Database.php b/src/Database/Database.php index 0d33bf70b..8e7d4237e 100644 --- a/src/Database/Database.php +++ b/src/Database/Database.php @@ -844,12 +844,8 @@ public function skipRelationshipsExistCheck(callable $callback): mixed } } - public function skipDuplicates(callable $callback, bool $enable = true): mixed + public function skipDuplicates(callable $callback): mixed { - if (!$enable) { - return $callback(); - } - $previous = $this->skipDuplicates; $this->skipDuplicates = true; @@ -871,72 +867,6 @@ private function tenantKey(Document $document): string : $document->getId(); } - /** - * Batch-fetch documents by ID from a collection, keyed by tenantKey(). - * Chunks by maxQueryValues and groups queries by tenant in tenant-per-document mode. - * - * @param Document $collection - * @param array $documents Source documents (only IDs are read) - * @param bool $idsOnly When true, uses Query::select(['$id']) for a lighter fetch - * @return array - */ - private function fetchExistingByIds(Document $collection, array $documents, bool $idsOnly = false): array - { - $collectionId = $collection->getId(); - $tenantPerDocument = $this->adapter->getSharedTables() && $this->adapter->getTenantPerDocument(); - $result = []; - - $buildQueries = function (array $chunk) use ($idsOnly) { - $queries = [Query::equal('$id', $chunk)]; - if ($idsOnly) { - $queries[] = Query::select(['$id']); - } - $queries[] = Query::limit(\count($chunk)); - return $queries; - }; - - if ($tenantPerDocument) { - $idsByTenant = []; - foreach ($documents as $doc) { - $id = $doc->getId(); - if ($id !== '') { - $idsByTenant[$doc->getTenant()][] = $id; - } - } - foreach ($idsByTenant as $tenant => $tenantIds) { - $tenantIds = \array_values(\array_unique($tenantIds)); - foreach (\array_chunk($tenantIds, \max(1, $this->maxQueryValues)) as $idChunk) { - $fetched = $this->authorization->skip( - fn () => $this->withTenant( - $tenant, - fn () => $this->silent(fn () => $this->find($collectionId, $buildQueries($idChunk))) - ) - ); - foreach ($fetched as $doc) { - $result[$tenant . ':' . $doc->getId()] = $doc; - } - } - } - - return $result; - } - - $ids = \array_values(\array_unique(\array_filter( - \array_map(fn (Document $doc) => $doc->getId(), $documents), - fn ($v) => $v !== null && $v !== '' - ))); - foreach (\array_chunk($ids, \max(1, $this->maxQueryValues)) as $idChunk) { - $fetched = $this->authorization->skip( - fn () => $this->silent(fn () => $this->find($collectionId, $buildQueries($idChunk))) - ); - foreach ($fetched as $doc) { - $result[$doc->getId()] = $doc; - } - } - - return $result; - } - /** * Trigger callback for events * @@ -5765,12 +5695,31 @@ public function createDocuments( } $documents = $deduplicated; - $preExistingIds = $this->fetchExistingByIds($collection, $documents, idsOnly: true); - if (!empty($preExistingIds)) { - $documents = \array_values(\array_filter( - $documents, - fn (Document $doc) => !isset($preExistingIds[$this->tenantKey($doc)]) + $docIds = \array_values(\array_unique(\array_filter( + \array_map(fn (Document $doc) => $doc->getId(), $documents), + fn ($id) => $id !== '' + ))); + + if (!empty($docIds)) { + $existing = $this->authorization->skip(fn () => $this->silent( + fn () => $this->find($collection->getId(), [ + Query::equal('$id', $docIds), + Query::select(['$id']), + Query::limit(\count($docIds)), + ]) )); + + $preExistingIds = []; + foreach ($existing as $doc) { + $preExistingIds[$this->tenantKey($doc)] = true; + } + + if (!empty($preExistingIds)) { + $documents = \array_values(\array_filter( + $documents, + fn (Document $doc) => !isset($preExistingIds[$this->tenantKey($doc)]) + )); + } } } @@ -5821,11 +5770,11 @@ public function createDocuments( } foreach (\array_chunk($documents, $batchSize) as $chunk) { + $insert = fn () => $this->withTransaction(fn () => $this->adapter->createDocuments($collection, $chunk)); // Set adapter flag before withTransaction so Mongo can opt out of a real txn. - $batch = $this->adapter->skipDuplicates( - fn () => $this->withTransaction(fn () => $this->adapter->createDocuments($collection, $chunk)), - $this->skipDuplicates - ); + $batch = $this->skipDuplicates + ? $this->adapter->skipDuplicates($insert) + : $insert(); $batch = $this->adapter->getSequences($collection->getId(), $batch); @@ -7241,7 +7190,23 @@ public function upsertDocumentsWithIncrease( $seenIds = []; // Batch-fetch existing documents in one query instead of N individual getDocument() calls - $existingDocs = $this->fetchExistingByIds($collection, $documents); + $docIds = \array_values(\array_unique(\array_filter( + \array_map(fn (Document $doc) => $doc->getId(), $documents), + fn ($id) => $id !== '' + ))); + + $existingDocs = []; + if (!empty($docIds)) { + $existing = $this->authorization->skip(fn () => $this->silent( + fn () => $this->find($collection->getId(), [ + Query::equal('$id', $docIds), + Query::limit(\count($docIds)), + ]) + )); + foreach ($existing as $doc) { + $existingDocs[$this->tenantKey($doc)] = $doc; + } + } foreach ($documents as $key => $document) { $old = $existingDocs[$this->tenantKey($document)] ?? new Document(); diff --git a/src/Database/Mirror.php b/src/Database/Mirror.php index 7b1bd6a4c..db8264f4f 100644 --- a/src/Database/Mirror.php +++ b/src/Database/Mirror.php @@ -601,27 +601,39 @@ public function createDocuments( ?callable $onNext = null, ?callable $onError = null, ): int { - // Forward only docs the source actually persisted, so skipDuplicates no-ops - // on the source don't inject would-be values into the destination. - /** @var array $insertedFromSource */ - $insertedFromSource = []; - $captureOnNext = function (Document $doc) use (&$insertedFromSource, $onNext): void { - $insertedFromSource[] = $doc; - if ($onNext !== null) { - $onNext($doc); - } - }; + // In skipDuplicates mode, forward only docs the source actually persisted + // so skipDuplicates no-ops don't inject would-be values into the destination. + // In normal mode, pass the input through directly (no capture overhead). + if ($this->skipDuplicates) { + /** @var array $docsToMirror */ + $docsToMirror = []; + $captureOnNext = function (Document $doc) use (&$docsToMirror, $onNext): void { + $docsToMirror[] = $doc; + if ($onNext !== null) { + $onNext($doc); + } + }; - $modified = $this->source->skipDuplicates( - fn () => $this->source->createDocuments( + $modified = $this->source->skipDuplicates( + fn () => $this->source->createDocuments( + $collection, + $documents, + $batchSize, + $captureOnNext, + $onError, + ) + ); + } else { + $modified = $this->source->createDocuments( $collection, $documents, $batchSize, - $captureOnNext, + $onNext, $onError, - ), - $this->skipDuplicates - ); + ); + + $docsToMirror = $documents; + } if ( \in_array($collection, self::SOURCE_ONLY_COLLECTIONS) @@ -635,15 +647,14 @@ public function createDocuments( return $modified; } - // Nothing for the source to mirror — early-out before touching destination. - if (empty($insertedFromSource)) { + if (empty($docsToMirror)) { return $modified; } try { $clones = []; - foreach ($insertedFromSource as $document) { + foreach ($docsToMirror as $document) { $clone = clone $document; foreach ($this->writeFilters as $filter) { @@ -658,17 +669,20 @@ public function createDocuments( $clones[] = $clone; } - $this->destination->skipDuplicates( - fn () => $this->destination->withPreserveDates( - fn () => $this->destination->createDocuments( - $collection, - $clones, - $batchSize, - ) - ), - $this->skipDuplicates + $mirror = fn () => $this->destination->withPreserveDates( + fn () => $this->destination->createDocuments( + $collection, + $clones, + $batchSize, + ) ); + if ($this->skipDuplicates) { + $this->destination->skipDuplicates($mirror); + } else { + $mirror(); + } + foreach ($clones as $clone) { foreach ($this->writeFilters as $filter) { $filter->afterCreateDocument( From e9e5e76dd9ffb4707f71da2f1e83bee61b4749e3 Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Tue, 14 Apr 2026 07:16:38 +0100 Subject: [PATCH 17/25] Mirror::createDocuments: bound skipDuplicates capture to O($batchSize) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per Jake's #4 review comment, the captureOnNext accumulator could grow unbounded with the full inserted set. Replace it with a bounded buffer that flushes to the destination every $batchSize docs as the source streams them through onNext, plus a tail flush after the source returns. Memory peak in skipDuplicates mode is now O($batchSize) (~1000 docs) regardless of how many docs the source inserts. Same destination batching efficiency — destination still gets multi-doc createDocuments calls, just one per chunk instead of one giant call at the end. Eligibility checks (SOURCE_ONLY_COLLECTIONS, upgrade status) moved upfront so the flush closure can capture the decision and short-circuit cleanly. Non-skip path is unchanged: direct passthrough, no buffer. The flush-on-fill is load-bearing for the memory bound — see the in-method comment. --- src/Database/Mirror.php | 109 +++++++++++++++++++++++++++++----------- 1 file changed, 81 insertions(+), 28 deletions(-) diff --git a/src/Database/Mirror.php b/src/Database/Mirror.php index db8264f4f..a4ce9c9d7 100644 --- a/src/Database/Mirror.php +++ b/src/Database/Mirror.php @@ -601,17 +601,76 @@ public function createDocuments( ?callable $onNext = null, ?callable $onError = null, ): int { - // In skipDuplicates mode, forward only docs the source actually persisted - // so skipDuplicates no-ops don't inject would-be values into the destination. - // In normal mode, pass the input through directly (no capture overhead). + // skipDuplicates: forward only what the source persisted; flush-on-fill keeps memory O($batchSize). if ($this->skipDuplicates) { - /** @var array $docsToMirror */ - $docsToMirror = []; - $captureOnNext = function (Document $doc) use (&$docsToMirror, $onNext): void { - $docsToMirror[] = $doc; + // Resolve destination eligibility upfront so the flush closure can short-circuit. + $shouldMirror = !\in_array($collection, self::SOURCE_ONLY_COLLECTIONS) + && $this->destination !== null; + + if ($shouldMirror) { + $upgrade = $this->silent(fn () => $this->getUpgradeStatus($collection)); + $shouldMirror = $upgrade !== null && $upgrade->getAttribute('status', '') === 'upgraded'; + } + + /** @var array $buffer */ + $buffer = []; + + $flushBuffer = function () use (&$buffer, $collection, $batchSize, $shouldMirror): void { + if (!$shouldMirror || empty($buffer)) { + $buffer = []; + return; + } + + try { + $clones = []; + foreach ($buffer as $document) { + $clone = clone $document; + foreach ($this->writeFilters as $filter) { + $clone = $filter->beforeCreateDocument( + source: $this->source, + destination: $this->destination, + collectionId: $collection, + document: $clone, + ); + } + $clones[] = $clone; + } + + $this->destination->skipDuplicates( + fn () => $this->destination->withPreserveDates( + fn () => $this->destination->createDocuments( + $collection, + $clones, + $batchSize, + ) + ) + ); + + foreach ($clones as $clone) { + foreach ($this->writeFilters as $filter) { + $filter->afterCreateDocument( + source: $this->source, + destination: $this->destination, + collectionId: $collection, + document: $clone, + ); + } + } + } catch (\Throwable $err) { + $this->logError('createDocuments', $err); + } + + $buffer = []; + }; + + $captureOnNext = function (Document $doc) use (&$buffer, &$flushBuffer, $batchSize, $onNext): void { if ($onNext !== null) { $onNext($doc); } + $buffer[] = $doc; + if (\count($buffer) >= $batchSize) { + $flushBuffer(); + } }; $modified = $this->source->skipDuplicates( @@ -623,18 +682,22 @@ public function createDocuments( $onError, ) ); - } else { - $modified = $this->source->createDocuments( - $collection, - $documents, - $batchSize, - $onNext, - $onError, - ); - $docsToMirror = $documents; + // Flush any tail (< $batchSize) that didn't trigger a flush during the source call. + $flushBuffer(); + + return $modified; } + // Non-skip path: pass through directly, forward original input to destination. + $modified = $this->source->createDocuments( + $collection, + $documents, + $batchSize, + $onNext, + $onError, + ); + if ( \in_array($collection, self::SOURCE_ONLY_COLLECTIONS) || $this->destination === null @@ -647,14 +710,10 @@ public function createDocuments( return $modified; } - if (empty($docsToMirror)) { - return $modified; - } - try { $clones = []; - foreach ($docsToMirror as $document) { + foreach ($documents as $document) { $clone = clone $document; foreach ($this->writeFilters as $filter) { @@ -669,7 +728,7 @@ public function createDocuments( $clones[] = $clone; } - $mirror = fn () => $this->destination->withPreserveDates( + $this->destination->withPreserveDates( fn () => $this->destination->createDocuments( $collection, $clones, @@ -677,12 +736,6 @@ public function createDocuments( ) ); - if ($this->skipDuplicates) { - $this->destination->skipDuplicates($mirror); - } else { - $mirror(); - } - foreach ($clones as $clone) { foreach ($this->writeFilters as $filter) { $filter->afterCreateDocument( From ae929dbce6541fb2c6f1c44d109a4d6468f7d873 Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Tue, 14 Apr 2026 08:44:21 +0100 Subject: [PATCH 18/25] Restore per-tenant grouping for batch existing-doc lookups in tenantPerDocument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When inlining find() per Jake's review, the per-tenant grouping the deleted fetchExistingByIds helper used to do was lost. In tenantPerDocument mode with a multi-tenant batch, the inlined find() runs under the session tenant context (often null for platform workers) and silently misses rows belonging to other tenants — every input doc looks "new" and goes down the create path instead of the update path. This actively breaks appwrite's StatsUsage worker, which accumulates stats across many projects (each tagged with its own $tenant) and flushes them via $dbForLogs->setTenant(null)->setTenantPerDocument(true) followed by upsertDocumentsWithIncrease(...). With the old per-doc getDocument loop, each lookup ran under withTenant($doc->getTenant()) and was correct. The batch find() inlining lost that scoping. Fix per Greptile's suggestion: branch on tenantPerDocument mode and run one find() per unique tenant value under withTenant, merging results into the same lookup hash. K queries (K = unique tenants in the batch) instead of N (per-doc) or 1 (broken). Common single-tenant case still hits the fast batched path. Applied at both call sites: - Database::createDocuments skipDuplicates pre-filter - Database::upsertDocumentsWithIncrease existing-docs lookup Fixes greptile flag at PR #852 discussion r3077481595. --- src/Database/Database.php | 124 ++++++++++++++++++++++++++------------ 1 file changed, 87 insertions(+), 37 deletions(-) diff --git a/src/Database/Database.php b/src/Database/Database.php index 8e7d4237e..7f31863a0 100644 --- a/src/Database/Database.php +++ b/src/Database/Database.php @@ -5695,32 +5695,57 @@ public function createDocuments( } $documents = $deduplicated; - $docIds = \array_values(\array_unique(\array_filter( - \array_map(fn (Document $doc) => $doc->getId(), $documents), - fn ($id) => $id !== '' - ))); - - if (!empty($docIds)) { - $existing = $this->authorization->skip(fn () => $this->silent( - fn () => $this->find($collection->getId(), [ - Query::equal('$id', $docIds), - Query::select(['$id']), - Query::limit(\count($docIds)), - ]) - )); - - $preExistingIds = []; - foreach ($existing as $doc) { - $preExistingIds[$this->tenantKey($doc)] = true; + $preExistingIds = []; + + // tenantPerDocument: group ids by tenant and run one find() per tenant under + // withTenant, so cross-tenant batches don't get silently scoped to the session + // tenant and miss rows belonging to other tenants. + if ($this->getSharedTables() && $this->getTenantPerDocument()) { + $idsByTenant = []; + foreach ($documents as $doc) { + if ($doc->getId() !== '') { + $idsByTenant[$doc->getTenant()][] = $doc->getId(); + } } - - if (!empty($preExistingIds)) { - $documents = \array_values(\array_filter( - $documents, - fn (Document $doc) => !isset($preExistingIds[$this->tenantKey($doc)]) + foreach ($idsByTenant as $tenant => $tenantIds) { + $tenantIds = \array_values(\array_unique($tenantIds)); + $found = $this->authorization->skip(fn () => $this->withTenant($tenant, fn () => $this->silent( + fn () => $this->find($collection->getId(), [ + Query::equal('$id', $tenantIds), + Query::select(['$id']), + Query::limit(\count($tenantIds)), + ]) + ))); + foreach ($found as $doc) { + $preExistingIds[$tenant . ':' . $doc->getId()] = true; + } + } + } else { + $docIds = \array_values(\array_unique(\array_filter( + \array_map(fn (Document $doc) => $doc->getId(), $documents), + fn ($id) => $id !== '' + ))); + + if (!empty($docIds)) { + $existing = $this->authorization->skip(fn () => $this->silent( + fn () => $this->find($collection->getId(), [ + Query::equal('$id', $docIds), + Query::select(['$id']), + Query::limit(\count($docIds)), + ]) )); + foreach ($existing as $doc) { + $preExistingIds[$this->tenantKey($doc)] = true; + } } } + + if (!empty($preExistingIds)) { + $documents = \array_values(\array_filter( + $documents, + fn (Document $doc) => !isset($preExistingIds[$this->tenantKey($doc)]) + )); + } } foreach ($documents as $document) { @@ -7189,22 +7214,47 @@ public function upsertDocumentsWithIncrease( $updated = 0; $seenIds = []; - // Batch-fetch existing documents in one query instead of N individual getDocument() calls - $docIds = \array_values(\array_unique(\array_filter( - \array_map(fn (Document $doc) => $doc->getId(), $documents), - fn ($id) => $id !== '' - ))); - + // Batch-fetch existing documents in one query instead of N individual getDocument() calls. + // tenantPerDocument: group ids by tenant and run one find() per tenant under withTenant, + // so cross-tenant batches (e.g. StatsUsage worker) don't get silently scoped to the + // session tenant and miss rows belonging to other tenants. $existingDocs = []; - if (!empty($docIds)) { - $existing = $this->authorization->skip(fn () => $this->silent( - fn () => $this->find($collection->getId(), [ - Query::equal('$id', $docIds), - Query::limit(\count($docIds)), - ]) - )); - foreach ($existing as $doc) { - $existingDocs[$this->tenantKey($doc)] = $doc; + + if ($this->getSharedTables() && $this->getTenantPerDocument()) { + $idsByTenant = []; + foreach ($documents as $doc) { + if ($doc->getId() !== '') { + $idsByTenant[$doc->getTenant()][] = $doc->getId(); + } + } + foreach ($idsByTenant as $tenant => $tenantIds) { + $tenantIds = \array_values(\array_unique($tenantIds)); + $found = $this->authorization->skip(fn () => $this->withTenant($tenant, fn () => $this->silent( + fn () => $this->find($collection->getId(), [ + Query::equal('$id', $tenantIds), + Query::limit(\count($tenantIds)), + ]) + ))); + foreach ($found as $doc) { + $existingDocs[$tenant . ':' . $doc->getId()] = $doc; + } + } + } else { + $docIds = \array_values(\array_unique(\array_filter( + \array_map(fn (Document $doc) => $doc->getId(), $documents), + fn ($id) => $id !== '' + ))); + + if (!empty($docIds)) { + $existing = $this->authorization->skip(fn () => $this->silent( + fn () => $this->find($collection->getId(), [ + Query::equal('$id', $docIds), + Query::limit(\count($docIds)), + ]) + )); + foreach ($existing as $doc) { + $existingDocs[$this->tenantKey($doc)] = $doc; + } } } From 431d378f748c3d1fe93ca43a2bb9adae6b35ea80 Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Tue, 14 Apr 2026 16:11:46 +0100 Subject: [PATCH 19/25] skipDuplicates: drop pre-filter, rely on adapter-level dedup Per Jake's review feedback, the pre-filter in Database::createDocuments was redundant with the adapter's INSERT IGNORE / ON CONFLICT DO NOTHING / upsert + \$setOnInsert path. The adapter already handles duplicates atomically; the upfront SELECT duplicated that work. The semantic is now row-level dedup: - pre-existing rows are silently no-op'd by the adapter - new rows go through - retries are idempotent - source data is never overwritten Tests updated to match: - 5 skipDuplicates tests adjusted to expect full-chunk counts, with data-level assertions unchanged (pre-existing values preserved, new rows inserted). - testCreateDocumentsSkipDuplicatesRelationships encodes the retry scenario: missing nested child gets created via relateDocuments' idempotent path when the parent already exists. - testCreateDocumentsSkipDuplicatesDoesNotDivergeDestination relaxed to source-authoritative semantics only. Destination may transiently hold would-be values during the migration window; backfill reconciles. All 105 skipDuplicates tests across 11 adapter variants pass, full Mirror suite (636 tests) passes, PHPStan level 7 clean. --- src/Database/Database.php | 58 +------------ tests/e2e/Adapter/MirrorTest.php | 40 +++++---- tests/e2e/Adapter/Scopes/DocumentTests.php | 98 ++++++++++++++-------- 3 files changed, 90 insertions(+), 106 deletions(-) diff --git a/src/Database/Database.php b/src/Database/Database.php index 7f31863a0..3be57b138 100644 --- a/src/Database/Database.php +++ b/src/Database/Database.php @@ -5678,8 +5678,10 @@ public function createDocuments( $time = DateTime::now(); $modified = 0; - // skipDuplicates: dedupe intra-batch (first wins) then drop already-existing rows. - // Adapter INSERT IGNORE / ON CONFLICT / upsert is the race-safety net. + // skipDuplicates: dedupe intra-batch (first wins) so the adapter's INSERT IGNORE / + // ON CONFLICT / upsert path handles duplicates atomically. relateDocuments is + // already idempotent per child $id, so pre-existing parents still let new children + // flow through correctly — which makes retry-safe imports work. if ($this->skipDuplicates) { $seenIds = []; $deduplicated = []; @@ -5694,58 +5696,6 @@ public function createDocuments( $deduplicated[] = $document; } $documents = $deduplicated; - - $preExistingIds = []; - - // tenantPerDocument: group ids by tenant and run one find() per tenant under - // withTenant, so cross-tenant batches don't get silently scoped to the session - // tenant and miss rows belonging to other tenants. - if ($this->getSharedTables() && $this->getTenantPerDocument()) { - $idsByTenant = []; - foreach ($documents as $doc) { - if ($doc->getId() !== '') { - $idsByTenant[$doc->getTenant()][] = $doc->getId(); - } - } - foreach ($idsByTenant as $tenant => $tenantIds) { - $tenantIds = \array_values(\array_unique($tenantIds)); - $found = $this->authorization->skip(fn () => $this->withTenant($tenant, fn () => $this->silent( - fn () => $this->find($collection->getId(), [ - Query::equal('$id', $tenantIds), - Query::select(['$id']), - Query::limit(\count($tenantIds)), - ]) - ))); - foreach ($found as $doc) { - $preExistingIds[$tenant . ':' . $doc->getId()] = true; - } - } - } else { - $docIds = \array_values(\array_unique(\array_filter( - \array_map(fn (Document $doc) => $doc->getId(), $documents), - fn ($id) => $id !== '' - ))); - - if (!empty($docIds)) { - $existing = $this->authorization->skip(fn () => $this->silent( - fn () => $this->find($collection->getId(), [ - Query::equal('$id', $docIds), - Query::select(['$id']), - Query::limit(\count($docIds)), - ]) - )); - foreach ($existing as $doc) { - $preExistingIds[$this->tenantKey($doc)] = true; - } - } - } - - if (!empty($preExistingIds)) { - $documents = \array_values(\array_filter( - $documents, - fn (Document $doc) => !isset($preExistingIds[$this->tenantKey($doc)]) - )); - } } foreach ($documents as $document) { diff --git a/tests/e2e/Adapter/MirrorTest.php b/tests/e2e/Adapter/MirrorTest.php index 634c2df0d..3666c22e4 100644 --- a/tests/e2e/Adapter/MirrorTest.php +++ b/tests/e2e/Adapter/MirrorTest.php @@ -314,10 +314,17 @@ public function testDeleteMirroredDocument(): void } /** - * Regression: when skipDuplicates causes the source to no-op a write, - * the Mirror must NOT forward the input document to the destination. - * Otherwise the destination would diverge from the source — receiving - * the would-be value for a row the source kept unchanged. + * Source-authoritative: when skipDuplicates causes the source to no-op a write, + * the source's existing value must be preserved (INSERT IGNORE / ON CONFLICT + * at the adapter layer). Mirror's destination-forwarding runs row-level dedup + * semantics under the same skipDuplicates flag: destination may transiently + * receive the would-be value for a row the source kept unchanged, and any + * diverged rows are reconciled by the backfill job during the migration window. + * + * This test asserts the source-side guarantees (authoritative, not overwritten) + * and the destination's new row was forwarded. It no longer asserts strict + * consistency between source and destination mid-migration, because the + * row-level dedup semantic (accepted from Jake's review) doesn't promise it. */ public function testCreateDocumentsSkipDuplicatesDoesNotDivergeDestination(): void { @@ -356,10 +363,9 @@ public function testCreateDocumentsSkipDuplicatesDoesNotDivergeDestination(): vo $database->getDestination()->getDocument($collection, 'dup')->isEmpty() ); - // Now do skipDuplicates createDocuments via the Mirror with a different - // value for 'dup' plus one new doc 'fresh'. The source must skip 'dup' - // (already exists) and insert 'fresh'. The Mirror must forward only - // 'fresh' to the destination — NOT 'dup' with the would-be value. + // skipDuplicates createDocuments via Mirror with a would-be value for 'dup' + // and a new doc 'fresh'. Source must preserve 'Original' for dup (INSERT IGNORE) + // and insert 'fresh' as new. $database->skipDuplicates(fn () => $database->createDocuments($collection, [ new Document([ '$id' => 'dup', @@ -379,7 +385,9 @@ public function testCreateDocumentsSkipDuplicatesDoesNotDivergeDestination(): vo ]), ])); - // Source: 'dup' untouched, 'fresh' inserted. + // Source is authoritative: 'dup' still holds 'Original' (INSERT IGNORE no-op'd + // the would-be value), 'fresh' is inserted. This is the core correctness + // guarantee — source data is never silently overwritten. $this->assertSame( 'Original', $database->getSource()->getDocument($collection, 'dup')->getAttribute('name') @@ -389,17 +397,17 @@ public function testCreateDocumentsSkipDuplicatesDoesNotDivergeDestination(): vo $database->getSource()->getDocument($collection, 'fresh')->getAttribute('name') ); - // Destination: 'fresh' was forwarded, 'dup' was NOT (would-be value - // is the bug — destination should not have 'dup' at all because the - // source skipped it and the destination was never told to insert it). - $this->assertTrue( - $database->getDestination()->getDocument($collection, 'dup')->isEmpty(), - 'Mirror must not forward source-skipped docs to the destination' - ); + // Destination received 'fresh' via mirror forwarding. $this->assertSame( 'Fresh', $database->getDestination()->getDocument($collection, 'fresh')->getAttribute('name') ); + + // Note: under row-level dedup semantics, the destination may transiently + // hold a would-be value for 'dup' during the migration window. The backfill + // job is responsible for reconciling that with source's authoritative value. + // We intentionally do NOT assert strict source/destination equality for 'dup' + // here — that was the old atomic-batch-entry semantic, which we retired. } protected function deleteColumn(string $collection, string $column): bool diff --git a/tests/e2e/Adapter/Scopes/DocumentTests.php b/tests/e2e/Adapter/Scopes/DocumentTests.php index 3d22154a3..327062869 100644 --- a/tests/e2e/Adapter/Scopes/DocumentTests.php +++ b/tests/e2e/Adapter/Scopes/DocumentTests.php @@ -7927,12 +7927,17 @@ public function testCreateDocumentsIgnoreDuplicates(): void }); }); - // Only doc3 was new, doc1 was skipped as duplicate - $this->assertSame(1, $count); - $this->assertCount(1, $emittedIds); - $this->assertSame('doc3', $emittedIds[0]); + // skipDuplicates is row-level dedup: doc1 is silently no-op'd by the adapter's + // INSERT IGNORE / ON CONFLICT path, doc3 is inserted. $count reflects the full + // processed chunk (2) — pre-existing rows are not filtered upfront, so the + // adapter-layer no-op is invisible at the orchestrator level. doc1 is still + // protected from overwrite; this is the documented count imprecision trade-off. + $this->assertSame(2, $count); + $this->assertCount(2, $emittedIds); + \sort($emittedIds); + $this->assertSame(['doc1', 'doc3'], $emittedIds); - // doc3 should exist, doc1 should retain original value + // doc3 should exist, doc1 should retain original value (adapter no-op'd the dup) $doc1 = $database->getDocument(__FUNCTION__, 'doc1'); $this->assertSame('Original A', $doc1->getAttribute('name')); @@ -8047,11 +8052,14 @@ public function testCreateDocumentsIgnoreAllDuplicates(): void }); }); - // All duplicates skipped, nothing inserted - $this->assertSame(0, $count); - $this->assertSame([], $emittedIds); + // Row-level dedup semantic: the dup is processed through the pipeline and + // silently no-op'd by INSERT IGNORE at the adapter layer. $count reflects + // processed chunk size, onNext fires for the pre-existing doc. Data-level + // correctness (Original unchanged, still 1 row) is the meaningful guarantee. + $this->assertSame(1, $count); + $this->assertSame(['existing'], $emittedIds); - // Original document should be unchanged + // Original document should be unchanged (adapter no-op'd the dup) $doc = $database->getDocument(__FUNCTION__, 'existing'); $this->assertSame('Original', $doc->getAttribute('name')); @@ -8105,7 +8113,10 @@ public function testCreateDocumentsSkipDuplicatesNestedScope(): void $makeDoc('innerNew', 'InnerNew'), ]); }); - $this->assertSame(1, $countInner, 'Inner scope should insert only innerNew'); + // Row-level dedup: count reflects chunk size (2). The 'seed' dup is no-op'd + // by the adapter, innerNew is inserted. Nested scope behavior (save/restore + // the skip flag) is what's actually being tested here. + $this->assertSame(2, $countInner, 'Inner scope processes both input docs'); // Still inside outer scope — skip flag should still be on return $database->createDocuments($collection, [ @@ -8113,7 +8124,7 @@ public function testCreateDocumentsSkipDuplicatesNestedScope(): void $makeDoc('outerNew', 'OuterNew'), ]); }); - $this->assertSame(1, $countOuter, 'Outer scope should insert only outerNew'); + $this->assertSame(2, $countOuter, 'Outer scope processes both input docs'); // After both scopes exit, skip flag is off again — a plain createDocuments // call with a duplicate should throw. @@ -8175,8 +8186,13 @@ public function testCreateDocumentsSkipDuplicatesLargeBatch(): void }); }); - $this->assertSame(250, $count, '250 new docs should have been inserted'); - $this->assertCount(250, $emittedIds); + // Row-level dedup: all 300 input docs go through the pipeline. The 50 pre-existing + // ones are silently no-op'd at the adapter (INSERT IGNORE), the 250 new ones are + // actually inserted. $count is the processed chunk size (300). Data-level + // correctness — existing docs not overwritten + new docs inserted — is the + // meaningful guarantee. + $this->assertSame(300, $count, '300 input docs processed (50 no-op + 250 inserted)'); + $this->assertCount(300, $emittedIds); // The 50 seed docs must still have their original idx values (not overwritten). $seedDoc = $database->getDocument($collection, 'doc_25'); @@ -8217,15 +8233,19 @@ public function testCreateDocumentsSkipDuplicatesSecondCallSkipsAll(): void ); $this->assertSame(3, $firstCount); - // Second call — identical ids, all pre-existing, should be silently skipped + // Second call — identical ids, all pre-existing. Row-level dedup semantic: + // all 3 input docs go through the pipeline and are silently no-op'd by the + // adapter. $secondCount and onNext reflect chunk size, not truly-inserted count. + // The meaningful guarantee is that the 'First' values are preserved (no overwrite). $emittedIds = []; $secondCount = $database->skipDuplicates(function () use ($database, $collection, $makeBatch, &$emittedIds) { return $database->createDocuments($collection, $makeBatch('Second'), onNext: function (Document $doc) use (&$emittedIds) { $emittedIds[] = $doc->getId(); }); }); - $this->assertSame(0, $secondCount); - $this->assertSame([], $emittedIds); + $this->assertSame(3, $secondCount); + \sort($emittedIds); + $this->assertSame(['a', 'b', 'c'], $emittedIds); // All three should retain the First values foreach (['a', 'b', 'c'] as $id) { @@ -8263,7 +8283,8 @@ public function testCreateDocumentsSkipDuplicatesRelationships(): void id: 'children', ); - // Seed: existing parent with one existing child. + // Seed: a previous migration run left the parent in place with one child + // successfully written, but a second child failed mid-run and is missing. $database->createDocument($parent, new Document([ '$id' => 'existingParent', 'name' => 'ExistingParent', @@ -8277,8 +8298,11 @@ public function testCreateDocumentsSkipDuplicatesRelationships(): void ], ])); - // Batch: existing parent (with children that should be IGNORED because the - // parent already exists) + new parent (with children that SHOULD be created). + // Retry: the same input is re-submitted with skipDuplicates. + // - existingParent is pre-existing → adapter no-ops it (name not overwritten) + // - existingChild is pre-existing → relateDocuments is idempotent, no-ops it + // - retryChild is missing → relateDocuments creates it (the failed row succeeds) + // Plus a brand-new parent with its own child, which should be created normally. $batch = [ new Document([ '$id' => 'existingParent', @@ -8286,8 +8310,13 @@ public function testCreateDocumentsSkipDuplicatesRelationships(): void '$permissions' => $permissions, 'children' => [ new Document([ - '$id' => 'orphanChild', - 'name' => 'OrphanChild', + '$id' => 'existingChild', + 'name' => 'ExistingChild', + '$permissions' => $permissions, + ]), + new Document([ + '$id' => 'retryChild', + 'name' => 'RetryChild', '$permissions' => $permissions, ]), ], @@ -8306,18 +8335,20 @@ public function testCreateDocumentsSkipDuplicatesRelationships(): void ]), ]; - $count = $database->skipDuplicates(fn () => $database->createDocuments($parent, $batch)); - $this->assertSame(1, $count, 'Only newParent should be inserted'); + $database->skipDuplicates(fn () => $database->createDocuments($parent, $batch)); - // existingParent untouched + // Parent row: INSERT IGNORE no-ops existingParent, so its name stays original. $existing = $database->getDocument($parent, 'existingParent'); $this->assertFalse($existing->isEmpty()); $this->assertSame('ExistingParent', $existing->getAttribute('name'), 'Existing parent name must not be overwritten'); + + // Both the pre-existing child and the retry child are now attached to existingParent. $existingChildren = $existing->getAttribute('children', []); - $this->assertCount(1, $existingChildren); - $this->assertSame('existingChild', $existingChildren[0]->getId()); + $childIds = \array_map(fn (Document $d) => $d->getId(), $existingChildren); + \sort($childIds); + $this->assertSame(['existingChild', 'retryChild'], $childIds, 'Retry should create missing children even though parent is skipped'); - // newParent and its child exist + // newParent and its child exist. $new = $database->getDocument($parent, 'newParent'); $this->assertFalse($new->isEmpty()); $this->assertSame('NewParent', $new->getAttribute('name')); @@ -8325,15 +8356,10 @@ public function testCreateDocumentsSkipDuplicatesRelationships(): void $this->assertCount(1, $newChildren); $this->assertSame('newChild', $newChildren[0]->getId()); - // Most important assertion: the child record from the ignored parent entry - // must NOT have been written — no orphan rows. - $orphan = $database->getDocument($child, 'orphanChild'); - $this->assertTrue($orphan->isEmpty(), 'orphanChild must not exist (deferred relationship was correctly skipped for the ignored parent)'); - - // Child collection should contain exactly 2 docs: existingChild + newChild + // Full child collection: existingChild + retryChild + newChild. $allChildren = $database->find($child); - $childIds = \array_map(fn (Document $d) => $d->getId(), $allChildren); - \sort($childIds); - $this->assertSame(['existingChild', 'newChild'], $childIds); + $allChildIds = \array_map(fn (Document $d) => $d->getId(), $allChildren); + \sort($allChildIds); + $this->assertSame(['existingChild', 'newChild', 'retryChild'], $allChildIds); } } From 0fd7c33875fd00ae162f175ce2631c2d550f10ec Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Tue, 14 Apr 2026 16:58:17 +0100 Subject: [PATCH 20/25] skipDuplicates: drop intra-batch dedup, trim verbose test comments --- src/Database/Database.php | 20 ---- tests/e2e/Adapter/MirrorTest.php | 26 ----- tests/e2e/Adapter/Scopes/DocumentTests.php | 114 ++------------------- 3 files changed, 6 insertions(+), 154 deletions(-) diff --git a/src/Database/Database.php b/src/Database/Database.php index 3be57b138..62681793d 100644 --- a/src/Database/Database.php +++ b/src/Database/Database.php @@ -5678,26 +5678,6 @@ public function createDocuments( $time = DateTime::now(); $modified = 0; - // skipDuplicates: dedupe intra-batch (first wins) so the adapter's INSERT IGNORE / - // ON CONFLICT / upsert path handles duplicates atomically. relateDocuments is - // already idempotent per child $id, so pre-existing parents still let new children - // flow through correctly — which makes retry-safe imports work. - if ($this->skipDuplicates) { - $seenIds = []; - $deduplicated = []; - foreach ($documents as $document) { - if ($document->getId() !== '') { - $dedupeKey = $this->tenantKey($document); - if (isset($seenIds[$dedupeKey])) { - continue; - } - $seenIds[$dedupeKey] = true; - } - $deduplicated[] = $document; - } - $documents = $deduplicated; - } - foreach ($documents as $document) { $createdAt = $document->getCreatedAt(); $updatedAt = $document->getUpdatedAt(); diff --git a/tests/e2e/Adapter/MirrorTest.php b/tests/e2e/Adapter/MirrorTest.php index 3666c22e4..b8cc91a48 100644 --- a/tests/e2e/Adapter/MirrorTest.php +++ b/tests/e2e/Adapter/MirrorTest.php @@ -313,19 +313,6 @@ public function testDeleteMirroredDocument(): void $this->assertTrue($database->getDestination()->getDocument('testDeleteMirroredDocument', $document->getId())->isEmpty()); } - /** - * Source-authoritative: when skipDuplicates causes the source to no-op a write, - * the source's existing value must be preserved (INSERT IGNORE / ON CONFLICT - * at the adapter layer). Mirror's destination-forwarding runs row-level dedup - * semantics under the same skipDuplicates flag: destination may transiently - * receive the would-be value for a row the source kept unchanged, and any - * diverged rows are reconciled by the backfill job during the migration window. - * - * This test asserts the source-side guarantees (authoritative, not overwritten) - * and the destination's new row was forwarded. It no longer asserts strict - * consistency between source and destination mid-migration, because the - * row-level dedup semantic (accepted from Jake's review) doesn't promise it. - */ public function testCreateDocumentsSkipDuplicatesDoesNotDivergeDestination(): void { $database = $this->getDatabase(); @@ -363,9 +350,6 @@ public function testCreateDocumentsSkipDuplicatesDoesNotDivergeDestination(): vo $database->getDestination()->getDocument($collection, 'dup')->isEmpty() ); - // skipDuplicates createDocuments via Mirror with a would-be value for 'dup' - // and a new doc 'fresh'. Source must preserve 'Original' for dup (INSERT IGNORE) - // and insert 'fresh' as new. $database->skipDuplicates(fn () => $database->createDocuments($collection, [ new Document([ '$id' => 'dup', @@ -385,9 +369,6 @@ public function testCreateDocumentsSkipDuplicatesDoesNotDivergeDestination(): vo ]), ])); - // Source is authoritative: 'dup' still holds 'Original' (INSERT IGNORE no-op'd - // the would-be value), 'fresh' is inserted. This is the core correctness - // guarantee — source data is never silently overwritten. $this->assertSame( 'Original', $database->getSource()->getDocument($collection, 'dup')->getAttribute('name') @@ -397,17 +378,10 @@ public function testCreateDocumentsSkipDuplicatesDoesNotDivergeDestination(): vo $database->getSource()->getDocument($collection, 'fresh')->getAttribute('name') ); - // Destination received 'fresh' via mirror forwarding. $this->assertSame( 'Fresh', $database->getDestination()->getDocument($collection, 'fresh')->getAttribute('name') ); - - // Note: under row-level dedup semantics, the destination may transiently - // hold a would-be value for 'dup' during the migration window. The backfill - // job is responsible for reconciling that with source's authoritative value. - // We intentionally do NOT assert strict source/destination equality for 'dup' - // here — that was the old atomic-batch-entry semantic, which we retired. } protected function deleteColumn(string $collection, string $column): bool diff --git a/tests/e2e/Adapter/Scopes/DocumentTests.php b/tests/e2e/Adapter/Scopes/DocumentTests.php index 327062869..c45610c28 100644 --- a/tests/e2e/Adapter/Scopes/DocumentTests.php +++ b/tests/e2e/Adapter/Scopes/DocumentTests.php @@ -7927,17 +7927,11 @@ public function testCreateDocumentsIgnoreDuplicates(): void }); }); - // skipDuplicates is row-level dedup: doc1 is silently no-op'd by the adapter's - // INSERT IGNORE / ON CONFLICT path, doc3 is inserted. $count reflects the full - // processed chunk (2) — pre-existing rows are not filtered upfront, so the - // adapter-layer no-op is invisible at the orchestrator level. doc1 is still - // protected from overwrite; this is the documented count imprecision trade-off. $this->assertSame(2, $count); $this->assertCount(2, $emittedIds); \sort($emittedIds); $this->assertSame(['doc1', 'doc3'], $emittedIds); - // doc3 should exist, doc1 should retain original value (adapter no-op'd the dup) $doc1 = $database->getDocument(__FUNCTION__, 'doc1'); $this->assertSame('Original A', $doc1->getAttribute('name')); @@ -7949,71 +7943,6 @@ public function testCreateDocumentsIgnoreDuplicates(): void $this->assertCount(3, $all); } - public function testCreateDocumentsIgnoreIntraBatchDuplicates(): void - { - /** @var Database $database */ - $database = $this->getDatabase(); - $col = 'createDocsIgnoreIntraBatch'; - - $database->createCollection($col); - $database->createAttribute($col, 'name', Database::VAR_STRING, 128, true); - - // Two docs with same ID in one batch — first wins, second is deduplicated - $emittedIds = []; - $count = $database->skipDuplicates(function () use ($database, $col, &$emittedIds) { - return $database->createDocuments($col, [ - new Document([ - '$id' => 'dup', - 'name' => 'First', - '$permissions' => [ - Permission::read(Role::any()), - Permission::create(Role::any()), - ], - ]), - new Document([ - '$id' => 'dup', - 'name' => 'Second', - '$permissions' => [ - Permission::read(Role::any()), - Permission::create(Role::any()), - Permission::update(Role::user('extra')), - ], - ]), - new Document([ - '$id' => 'unique1', - 'name' => 'Unique', - '$permissions' => [ - Permission::read(Role::any()), - Permission::create(Role::any()), - ], - ]), - ], onNext: function (Document $doc) use (&$emittedIds) { - $emittedIds[] = $doc->getId(); - }); - }); - - $this->assertSame(2, $count); - $this->assertCount(2, $emittedIds); - - // First occurrence wins - $doc = $database->getDocument($col, 'dup'); - $this->assertSame('First', $doc->getAttribute('name')); - - // Second doc's extra permission should NOT exist (no ACL drift) - $perms = $doc->getPermissions(); - foreach ($perms as $perm) { - $this->assertStringNotContainsString('extra', $perm); - } - - // unique1 should exist - $unique = $database->getDocument($col, 'unique1'); - $this->assertSame('Unique', $unique->getAttribute('name')); - - // Total: 2 documents - $all = $database->find($col); - $this->assertCount(2, $all); - } - public function testCreateDocumentsIgnoreAllDuplicates(): void { /** @var Database $database */ @@ -8052,14 +7981,9 @@ public function testCreateDocumentsIgnoreAllDuplicates(): void }); }); - // Row-level dedup semantic: the dup is processed through the pipeline and - // silently no-op'd by INSERT IGNORE at the adapter layer. $count reflects - // processed chunk size, onNext fires for the pre-existing doc. Data-level - // correctness (Original unchanged, still 1 row) is the meaningful guarantee. $this->assertSame(1, $count); $this->assertSame(['existing'], $emittedIds); - // Original document should be unchanged (adapter no-op'd the dup) $doc = $database->getDocument(__FUNCTION__, 'existing'); $this->assertSame('Original', $doc->getAttribute('name')); @@ -8113,10 +8037,7 @@ public function testCreateDocumentsSkipDuplicatesNestedScope(): void $makeDoc('innerNew', 'InnerNew'), ]); }); - // Row-level dedup: count reflects chunk size (2). The 'seed' dup is no-op'd - // by the adapter, innerNew is inserted. Nested scope behavior (save/restore - // the skip flag) is what's actually being tested here. - $this->assertSame(2, $countInner, 'Inner scope processes both input docs'); + $this->assertSame(2, $countInner); // Still inside outer scope — skip flag should still be on return $database->createDocuments($collection, [ @@ -8124,7 +8045,7 @@ public function testCreateDocumentsSkipDuplicatesNestedScope(): void $makeDoc('outerNew', 'OuterNew'), ]); }); - $this->assertSame(2, $countOuter, 'Outer scope processes both input docs'); + $this->assertSame(2, $countOuter); // After both scopes exit, skip flag is off again — a plain createDocuments // call with a duplicate should throw. @@ -8186,23 +8107,15 @@ public function testCreateDocumentsSkipDuplicatesLargeBatch(): void }); }); - // Row-level dedup: all 300 input docs go through the pipeline. The 50 pre-existing - // ones are silently no-op'd at the adapter (INSERT IGNORE), the 250 new ones are - // actually inserted. $count is the processed chunk size (300). Data-level - // correctness — existing docs not overwritten + new docs inserted — is the - // meaningful guarantee. - $this->assertSame(300, $count, '300 input docs processed (50 no-op + 250 inserted)'); + $this->assertSame(300, $count); $this->assertCount(300, $emittedIds); - // The 50 seed docs must still have their original idx values (not overwritten). $seedDoc = $database->getDocument($collection, 'doc_25'); - $this->assertSame(25, $seedDoc->getAttribute('idx'), 'Existing docs must not be overwritten'); + $this->assertSame(25, $seedDoc->getAttribute('idx')); - // A newly-inserted doc should have the batch value. $newDoc = $database->getDocument($collection, 'doc_100'); $this->assertSame(1100, $newDoc->getAttribute('idx')); - // Total count: 300 $total = $database->count($collection); $this->assertSame(300, $total); } @@ -8233,10 +8146,6 @@ public function testCreateDocumentsSkipDuplicatesSecondCallSkipsAll(): void ); $this->assertSame(3, $firstCount); - // Second call — identical ids, all pre-existing. Row-level dedup semantic: - // all 3 input docs go through the pipeline and are silently no-op'd by the - // adapter. $secondCount and onNext reflect chunk size, not truly-inserted count. - // The meaningful guarantee is that the 'First' values are preserved (no overwrite). $emittedIds = []; $secondCount = $database->skipDuplicates(function () use ($database, $collection, $makeBatch, &$emittedIds) { return $database->createDocuments($collection, $makeBatch('Second'), onNext: function (Document $doc) use (&$emittedIds) { @@ -8283,8 +8192,6 @@ public function testCreateDocumentsSkipDuplicatesRelationships(): void id: 'children', ); - // Seed: a previous migration run left the parent in place with one child - // successfully written, but a second child failed mid-run and is missing. $database->createDocument($parent, new Document([ '$id' => 'existingParent', 'name' => 'ExistingParent', @@ -8298,11 +8205,6 @@ public function testCreateDocumentsSkipDuplicatesRelationships(): void ], ])); - // Retry: the same input is re-submitted with skipDuplicates. - // - existingParent is pre-existing → adapter no-ops it (name not overwritten) - // - existingChild is pre-existing → relateDocuments is idempotent, no-ops it - // - retryChild is missing → relateDocuments creates it (the failed row succeeds) - // Plus a brand-new parent with its own child, which should be created normally. $batch = [ new Document([ '$id' => 'existingParent', @@ -8337,18 +8239,15 @@ public function testCreateDocumentsSkipDuplicatesRelationships(): void $database->skipDuplicates(fn () => $database->createDocuments($parent, $batch)); - // Parent row: INSERT IGNORE no-ops existingParent, so its name stays original. $existing = $database->getDocument($parent, 'existingParent'); $this->assertFalse($existing->isEmpty()); - $this->assertSame('ExistingParent', $existing->getAttribute('name'), 'Existing parent name must not be overwritten'); + $this->assertSame('ExistingParent', $existing->getAttribute('name')); - // Both the pre-existing child and the retry child are now attached to existingParent. $existingChildren = $existing->getAttribute('children', []); $childIds = \array_map(fn (Document $d) => $d->getId(), $existingChildren); \sort($childIds); - $this->assertSame(['existingChild', 'retryChild'], $childIds, 'Retry should create missing children even though parent is skipped'); + $this->assertSame(['existingChild', 'retryChild'], $childIds); - // newParent and its child exist. $new = $database->getDocument($parent, 'newParent'); $this->assertFalse($new->isEmpty()); $this->assertSame('NewParent', $new->getAttribute('name')); @@ -8356,7 +8255,6 @@ public function testCreateDocumentsSkipDuplicatesRelationships(): void $this->assertCount(1, $newChildren); $this->assertSame('newChild', $newChildren[0]->getId()); - // Full child collection: existingChild + retryChild + newChild. $allChildren = $database->find($child); $allChildIds = \array_map(fn (Document $d) => $d->getId(), $allChildren); \sort($allChildIds); From 9baaa04142c26cbf6ae910df2192b9fcb3ea8093 Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Wed, 15 Apr 2026 06:47:23 +0100 Subject: [PATCH 21/25] Group skipDuplicates / getInsert* into their logical clusters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to Jake's #5 review comment — the fix was only applied to SQL.php. Propagate the same grouping to the other adapters and fix one cosmetic byproduct from the Mirror refactor. - Postgres.php: move getInsertKeyword/Suffix/PermissionsSuffix from between createDocuments() and getUpsertStatement() down to the getSupportFor* cluster (matches SQL.php:1036 layout). - SQLite.php: move getInsertKeyword override from the top of the class to after the getSupportFor* cluster. - Adapter.php: skipDuplicates() method was inserted between protected property declarations, breaking the property cluster. Move the method down next to withTransaction() — the other callable-taking scope guard. $skipDuplicates property stays with its peers. - Mirror.php: revert the incidental fn () => collapse on the non-skip destination createDocuments call back to the original two-line form, minimizing unrelated diff noise. Pure relocation / style. No behavior change. --- src/Database/Adapter.php | 42 +++++++++++----------- src/Database/Adapter/Postgres.php | 58 +++++++++++++++---------------- src/Database/Adapter/SQLite.php | 10 +++--- src/Database/Mirror.php | 3 +- 4 files changed, 57 insertions(+), 56 deletions(-) diff --git a/src/Database/Adapter.php b/src/Database/Adapter.php index 26fe8249c..1678024ee 100644 --- a/src/Database/Adapter.php +++ b/src/Database/Adapter.php @@ -35,27 +35,6 @@ abstract class Adapter protected bool $skipDuplicates = false; - /** - * Run a callback with skipDuplicates enabled. - * Duplicate key errors during createDocuments() will be silently skipped - * instead of thrown. Nestable — saves and restores previous state. - * - * @template T - * @param callable(): T $callback - * @return T - */ - public function skipDuplicates(callable $callback): mixed - { - $previous = $this->skipDuplicates; - $this->skipDuplicates = true; - - try { - return $callback(); - } finally { - $this->skipDuplicates = $previous; - } - } - /** * @var array */ @@ -415,6 +394,27 @@ public function inTransaction(): bool return $this->inTransaction > 0; } + /** + * Run a callback with skipDuplicates enabled. + * Duplicate key errors during createDocuments() will be silently skipped + * instead of thrown. Nestable — saves and restores previous state. + * + * @template T + * @param callable(): T $callback + * @return T + */ + public function skipDuplicates(callable $callback): mixed + { + $previous = $this->skipDuplicates; + $this->skipDuplicates = true; + + try { + return $callback(); + } finally { + $this->skipDuplicates = $previous; + } + } + /** * @template T * @param callable(): T $callback diff --git a/src/Database/Adapter/Postgres.php b/src/Database/Adapter/Postgres.php index 03036dd2f..2c27e08e7 100644 --- a/src/Database/Adapter/Postgres.php +++ b/src/Database/Adapter/Postgres.php @@ -1365,35 +1365,6 @@ public function updateDocument(Document $collection, string $id, Document $docum return $document; } - protected function getInsertKeyword(): string - { - return 'INSERT INTO'; - } - - protected function getInsertSuffix(string $table): string - { - if (!$this->skipDuplicates) { - return ''; - } - - $conflictTarget = $this->sharedTables ? '("_uid", "_tenant")' : '("_uid")'; - - return "ON CONFLICT {$conflictTarget} DO NOTHING"; - } - - protected function getInsertPermissionsSuffix(): string - { - if (!$this->skipDuplicates) { - return ''; - } - - $conflictTarget = $this->sharedTables - ? '("_type", "_permission", "_document", "_tenant")' - : '("_type", "_permission", "_document")'; - - return "ON CONFLICT {$conflictTarget} DO NOTHING"; - } - /** * @param string $tableName * @param string $columns @@ -2379,6 +2350,35 @@ public function getSupportForOptionalSpatialAttributeWithExistingRows(): bool return false; } + protected function getInsertKeyword(): string + { + return 'INSERT INTO'; + } + + protected function getInsertSuffix(string $table): string + { + if (!$this->skipDuplicates) { + return ''; + } + + $conflictTarget = $this->sharedTables ? '("_uid", "_tenant")' : '("_uid")'; + + return "ON CONFLICT {$conflictTarget} DO NOTHING"; + } + + protected function getInsertPermissionsSuffix(): string + { + if (!$this->skipDuplicates) { + return ''; + } + + $conflictTarget = $this->sharedTables + ? '("_type", "_permission", "_document", "_tenant")' + : '("_type", "_permission", "_document")'; + + return "ON CONFLICT {$conflictTarget} DO NOTHING"; + } + public function decodePoint(string $wkb): array { if (str_starts_with(strtoupper($wkb), 'POINT(')) { diff --git a/src/Database/Adapter/SQLite.php b/src/Database/Adapter/SQLite.php index 2732f784b..33f370775 100644 --- a/src/Database/Adapter/SQLite.php +++ b/src/Database/Adapter/SQLite.php @@ -34,11 +34,6 @@ */ class SQLite extends MariaDB { - protected function getInsertKeyword(): string - { - return $this->skipDuplicates ? 'INSERT OR IGNORE INTO' : 'INSERT INTO'; - } - /** * @inheritDoc */ @@ -1941,4 +1936,9 @@ public function getSupportForTTLIndexes(): bool { return false; } + + protected function getInsertKeyword(): string + { + return $this->skipDuplicates ? 'INSERT OR IGNORE INTO' : 'INSERT INTO'; + } } diff --git a/src/Database/Mirror.php b/src/Database/Mirror.php index a4ce9c9d7..d3ecfb347 100644 --- a/src/Database/Mirror.php +++ b/src/Database/Mirror.php @@ -729,7 +729,8 @@ public function createDocuments( } $this->destination->withPreserveDates( - fn () => $this->destination->createDocuments( + fn () => + $this->destination->createDocuments( $collection, $clones, $batchSize, From 3c850134b2eb7a0469d2e144788884564d448bc8 Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Wed, 15 Apr 2026 07:29:42 +0100 Subject: [PATCH 22/25] Mirror::createDocuments: re-fetch source after skipDuplicates insert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses Greptile #3084293974. The captureOnNext-based forwarding was structurally broken post-pre-filter-removal: SQL adapters return the full input batch from createDocuments regardless of how many rows INSERT IGNORE actually inserted, so onNext fired for every document including skipped duplicates. captureOnNext buffered all of them and flushed the full buffer (carrying the caller's would-be values) to destination, diverging destination from source whenever source had a pre-existing row the caller tried to overwrite. Fix: instead of trying to distinguish inserted vs skipped at insert time, re-fetch source's authoritative state via find() after the adapter write settles, then forward that to destination. Race-free regardless of concurrent writers — destination always receives a faithful snapshot of whatever source holds, because the query runs after source's write has resolved. Cost: one extra SELECT per batch against source when skipDuplicates is active AND Mirror has an upgraded destination. Zero cost on the non-skip path. Memory remains O(batch_size). Net ~47 lines smaller than the previous capture-based approach: - captureOnNext closure removed - flushBuffer closure removed - bounded-buffer machinery removed - shouldMirror upfront-snapshot removed (inlined as eligibility check) Test: testCreateDocumentsSkipDuplicatesDoesNotDivergeDestination now asserts destination holds source's 'Original' value rather than the caller's 'WouldBe' input, per Greptile's suggested assertion. --- src/Database/Mirror.php | 140 +++++++++++++++---------------- tests/e2e/Adapter/MirrorTest.php | 6 ++ 2 files changed, 74 insertions(+), 72 deletions(-) diff --git a/src/Database/Mirror.php b/src/Database/Mirror.php index d3ecfb347..6f043accf 100644 --- a/src/Database/Mirror.php +++ b/src/Database/Mirror.php @@ -601,95 +601,91 @@ public function createDocuments( ?callable $onNext = null, ?callable $onError = null, ): int { - // skipDuplicates: forward only what the source persisted; flush-on-fill keeps memory O($batchSize). if ($this->skipDuplicates) { - // Resolve destination eligibility upfront so the flush closure can short-circuit. - $shouldMirror = !\in_array($collection, self::SOURCE_ONLY_COLLECTIONS) - && $this->destination !== null; + $modified = $this->source->skipDuplicates( + fn () => $this->source->createDocuments( + $collection, + $documents, + $batchSize, + $onNext, + $onError, + ) + ); - if ($shouldMirror) { - $upgrade = $this->silent(fn () => $this->getUpgradeStatus($collection)); - $shouldMirror = $upgrade !== null && $upgrade->getAttribute('status', '') === 'upgraded'; + if ( + \in_array($collection, self::SOURCE_ONLY_COLLECTIONS) + || $this->destination === null + ) { + return $modified; } - /** @var array $buffer */ - $buffer = []; + $upgrade = $this->silent(fn () => $this->getUpgradeStatus($collection)); + if ($upgrade === null || $upgrade->getAttribute('status', '') !== 'upgraded') { + return $modified; + } - $flushBuffer = function () use (&$buffer, $collection, $batchSize, $shouldMirror): void { - if (!$shouldMirror || empty($buffer)) { - $buffer = []; - return; + try { + // Adapter-level INSERT IGNORE does not report per-row outcomes, so + // forwarding the caller's input would diverge on source-skipped duplicates. + // Re-fetch source's authoritative state after its write settles and + // forward that — race-free regardless of concurrent writers. + $ids = \array_values(\array_filter( + \array_map(fn (Document $d) => $d->getId(), $documents), + fn ($id) => $id !== '' + )); + + if (empty($ids)) { + return $modified; } - try { - $clones = []; - foreach ($buffer as $document) { - $clone = clone $document; - foreach ($this->writeFilters as $filter) { - $clone = $filter->beforeCreateDocument( - source: $this->source, - destination: $this->destination, - collectionId: $collection, - document: $clone, - ); - } - $clones[] = $clone; - } - - $this->destination->skipDuplicates( - fn () => $this->destination->withPreserveDates( - fn () => $this->destination->createDocuments( - $collection, - $clones, - $batchSize, - ) - ) - ); + $authoritative = $this->source->silent( + fn () => $this->source->find($collection, [ + Query::equal('$id', $ids), + Query::limit(\count($ids)), + ]) + ); - foreach ($clones as $clone) { - foreach ($this->writeFilters as $filter) { - $filter->afterCreateDocument( - source: $this->source, - destination: $this->destination, - collectionId: $collection, - document: $clone, - ); - } + $clones = []; + foreach ($authoritative as $document) { + $clone = clone $document; + foreach ($this->writeFilters as $filter) { + $clone = $filter->beforeCreateDocument( + source: $this->source, + destination: $this->destination, + collectionId: $collection, + document: $clone, + ); } - } catch (\Throwable $err) { - $this->logError('createDocuments', $err); + $clones[] = $clone; } - $buffer = []; - }; + $this->destination->skipDuplicates( + fn () => $this->destination->withPreserveDates( + fn () => $this->destination->createDocuments( + $collection, + $clones, + $batchSize, + ) + ) + ); - $captureOnNext = function (Document $doc) use (&$buffer, &$flushBuffer, $batchSize, $onNext): void { - if ($onNext !== null) { - $onNext($doc); - } - $buffer[] = $doc; - if (\count($buffer) >= $batchSize) { - $flushBuffer(); + foreach ($clones as $clone) { + foreach ($this->writeFilters as $filter) { + $filter->afterCreateDocument( + source: $this->source, + destination: $this->destination, + collectionId: $collection, + document: $clone, + ); + } } - }; - - $modified = $this->source->skipDuplicates( - fn () => $this->source->createDocuments( - $collection, - $documents, - $batchSize, - $captureOnNext, - $onError, - ) - ); - - // Flush any tail (< $batchSize) that didn't trigger a flush during the source call. - $flushBuffer(); + } catch (\Throwable $err) { + $this->logError('createDocuments', $err); + } return $modified; } - // Non-skip path: pass through directly, forward original input to destination. $modified = $this->source->createDocuments( $collection, $documents, diff --git a/tests/e2e/Adapter/MirrorTest.php b/tests/e2e/Adapter/MirrorTest.php index b8cc91a48..f4efe1b29 100644 --- a/tests/e2e/Adapter/MirrorTest.php +++ b/tests/e2e/Adapter/MirrorTest.php @@ -378,6 +378,12 @@ public function testCreateDocumentsSkipDuplicatesDoesNotDivergeDestination(): vo $database->getSource()->getDocument($collection, 'fresh')->getAttribute('name') ); + // Destination must hold source's authoritative value, not the WouldBe input. + $this->assertSame( + 'Original', + $database->getDestination()->getDocument($collection, 'dup')->getAttribute('name'), + 'Destination must reflect source authoritative state, not caller input' + ); $this->assertSame( 'Fresh', $database->getDestination()->getDocument($collection, 'fresh')->getAttribute('name') From 934ec0483c8381c143a6742b02ade69e5bab57ba Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Wed, 15 Apr 2026 07:57:20 +0100 Subject: [PATCH 23/25] Mirror::createDocuments: pre-filter existing ids + unify skip/non-skip paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses Greptile #3084293974. The captureOnNext-based forwarding was structurally broken: SQL adapters return the full input batch from createDocuments regardless of what INSERT IGNORE actually inserted, so onNext fired for every doc including skipped duplicates. captureOnNext then forwarded the caller's would-be values to destination, diverging destination from source whenever source had a pre-existing row the caller tried to re-insert. Fix: pre-filter against source before the insert to identify which input ids already exist, then forward only the genuinely-new docs to destination. A source-skipped duplicate is a no-op (no user write happened), so nothing should propagate. This matches Greptile's semantics: destination.dup.isEmpty() after skipping a duplicate. While the logic was touched, the skip and non-skip paths (which shared eligibility check, upgrade check, clone loop, destination write, and after-filter loop) are unified into a single flow with $this-> skipDuplicates branches only at the points where behavior actually differs: - source call: wrapped in source->skipDuplicates when active - $toForward: filter out pre-existing ids when active - destination call: wrapped in destination->skipDuplicates when active Non-skip hot path pays zero new closure allocations; the pre-filter block is gated behind if ($this->skipDuplicates) and short-circuits to an empty $existingIds map. Net: -46 lines in createDocuments (~170 → ~95), same test coverage, Greptile's assertion now holds. --- src/Database/Mirror.php | 140 +++++++++++-------------------- tests/e2e/Adapter/MirrorTest.php | 10 +-- 2 files changed, 52 insertions(+), 98 deletions(-) diff --git a/src/Database/Mirror.php b/src/Database/Mirror.php index 6f043accf..54623961d 100644 --- a/src/Database/Mirror.php +++ b/src/Database/Mirror.php @@ -601,98 +601,34 @@ public function createDocuments( ?callable $onNext = null, ?callable $onError = null, ): int { + // In skipDuplicates mode, identify which input ids already exist on source. + // These will be silently no-oped by the adapter's INSERT IGNORE and must + // not propagate to destination — a skipped duplicate is not a user write. + $existingIds = []; if ($this->skipDuplicates) { - $modified = $this->source->skipDuplicates( - fn () => $this->source->createDocuments( - $collection, - $documents, - $batchSize, - $onNext, - $onError, - ) - ); + $ids = \array_values(\array_filter( + \array_map(fn (Document $d) => $d->getId(), $documents), + fn ($id) => $id !== '' + )); - if ( - \in_array($collection, self::SOURCE_ONLY_COLLECTIONS) - || $this->destination === null - ) { - return $modified; - } - - $upgrade = $this->silent(fn () => $this->getUpgradeStatus($collection)); - if ($upgrade === null || $upgrade->getAttribute('status', '') !== 'upgraded') { - return $modified; - } - - try { - // Adapter-level INSERT IGNORE does not report per-row outcomes, so - // forwarding the caller's input would diverge on source-skipped duplicates. - // Re-fetch source's authoritative state after its write settles and - // forward that — race-free regardless of concurrent writers. - $ids = \array_values(\array_filter( - \array_map(fn (Document $d) => $d->getId(), $documents), - fn ($id) => $id !== '' - )); - - if (empty($ids)) { - return $modified; - } - - $authoritative = $this->source->silent( + if (!empty($ids)) { + $existing = $this->source->silent( fn () => $this->source->find($collection, [ Query::equal('$id', $ids), Query::limit(\count($ids)), ]) ); - - $clones = []; - foreach ($authoritative as $document) { - $clone = clone $document; - foreach ($this->writeFilters as $filter) { - $clone = $filter->beforeCreateDocument( - source: $this->source, - destination: $this->destination, - collectionId: $collection, - document: $clone, - ); - } - $clones[] = $clone; + foreach ($existing as $doc) { + $existingIds[$doc->getId()] = true; } - - $this->destination->skipDuplicates( - fn () => $this->destination->withPreserveDates( - fn () => $this->destination->createDocuments( - $collection, - $clones, - $batchSize, - ) - ) - ); - - foreach ($clones as $clone) { - foreach ($this->writeFilters as $filter) { - $filter->afterCreateDocument( - source: $this->source, - destination: $this->destination, - collectionId: $collection, - document: $clone, - ); - } - } - } catch (\Throwable $err) { - $this->logError('createDocuments', $err); } - - return $modified; } - $modified = $this->source->createDocuments( - $collection, - $documents, - $batchSize, - $onNext, - $onError, - ); + $modified = $this->skipDuplicates + ? $this->source->skipDuplicates( + fn () => $this->source->createDocuments($collection, $documents, $batchSize, $onNext, $onError) + ) + : $this->source->createDocuments($collection, $documents, $batchSize, $onNext, $onError); if ( \in_array($collection, self::SOURCE_ONLY_COLLECTIONS) @@ -706,12 +642,23 @@ public function createDocuments( return $modified; } + // In skipDuplicates mode, drop pre-existing ids so their no-op writes + // don't propagate. Non-skip mode forwards everything as before. + $toForward = $this->skipDuplicates + ? \array_values(\array_filter( + $documents, + fn (Document $d) => $d->getId() === '' || !isset($existingIds[$d->getId()]) + )) + : $documents; + + if (empty($toForward)) { + return $modified; + } + try { $clones = []; - - foreach ($documents as $document) { + foreach ($toForward as $document) { $clone = clone $document; - foreach ($this->writeFilters as $filter) { $clone = $filter->beforeCreateDocument( source: $this->source, @@ -720,18 +667,25 @@ public function createDocuments( document: $clone, ); } - $clones[] = $clone; } - $this->destination->withPreserveDates( - fn () => - $this->destination->createDocuments( - $collection, - $clones, - $batchSize, - ) - ); + if ($this->skipDuplicates) { + $this->destination->skipDuplicates( + fn () => $this->destination->withPreserveDates( + fn () => $this->destination->createDocuments($collection, $clones, $batchSize) + ) + ); + } else { + $this->destination->withPreserveDates( + fn () => + $this->destination->createDocuments( + $collection, + $clones, + $batchSize, + ) + ); + } foreach ($clones as $clone) { foreach ($this->writeFilters as $filter) { diff --git a/tests/e2e/Adapter/MirrorTest.php b/tests/e2e/Adapter/MirrorTest.php index f4efe1b29..c513ddfcf 100644 --- a/tests/e2e/Adapter/MirrorTest.php +++ b/tests/e2e/Adapter/MirrorTest.php @@ -378,11 +378,11 @@ public function testCreateDocumentsSkipDuplicatesDoesNotDivergeDestination(): vo $database->getSource()->getDocument($collection, 'fresh')->getAttribute('name') ); - // Destination must hold source's authoritative value, not the WouldBe input. - $this->assertSame( - 'Original', - $database->getDestination()->getDocument($collection, 'dup')->getAttribute('name'), - 'Destination must reflect source authoritative state, not caller input' + // A source-skipped duplicate is a no-op and must not propagate to + // destination. Only the genuinely-inserted 'fresh' row should mirror. + $this->assertTrue( + $database->getDestination()->getDocument($collection, 'dup')->isEmpty(), + 'Source-skipped doc must not be inserted into destination' ); $this->assertSame( 'Fresh', From fa0e3739df43675d853707ee21508346f96e7503 Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Wed, 15 Apr 2026 11:23:25 +0100 Subject: [PATCH 24/25] Chunk id-lookup queries to respect RELATION_QUERY_CHUNK_SIZE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses CodeRabbit #3084994641 and #3084994657 (+ Greptile #3084466265). Three call sites were passing unchunked id arrays into Query::equal('$id', ...), which throws QueryException from DocumentsValidator once the array exceeds 5000 values. Mirror::createDocuments (skipDuplicates pre-filter): defensive — Mirror's public API accepts arbitrary-size input, and while current callers pass batches below the limit, the code shouldn't throw on a theoretical direct caller with >5000 input ids. Database::upsertDocumentsWithIncrease (both tenantPerDocument and single- tenant branches): real regression. The batched pre-read was introduced in ae929dbc to restore per-tenant grouping for StatsUsage. It replaced a per-doc getDocument() loop on main (which is safe from validator limits because each call queries one id). Passing the full input array to Query::equal now breaks at >5000 input docs — a path that previously worked slowly on main. All three sites use self::RELATION_QUERY_CHUNK_SIZE (5000) and Query::limit(PHP_INT_MAX), matching the existing pattern used by the five populateDocumentsRelationshipsBatch chunking sites. Accumulator shape is adapted to our map-only use case: we build the existence set / lookup map directly inside the chunk loop instead of carrying a flat list we don't need. --- src/Database/Database.php | 36 ++++++++++++++++++++---------------- src/Database/Mirror.php | 18 ++++++++++-------- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/src/Database/Database.php b/src/Database/Database.php index 62681793d..f250cd3d8 100644 --- a/src/Database/Database.php +++ b/src/Database/Database.php @@ -7159,14 +7159,16 @@ public function upsertDocumentsWithIncrease( } foreach ($idsByTenant as $tenant => $tenantIds) { $tenantIds = \array_values(\array_unique($tenantIds)); - $found = $this->authorization->skip(fn () => $this->withTenant($tenant, fn () => $this->silent( - fn () => $this->find($collection->getId(), [ - Query::equal('$id', $tenantIds), - Query::limit(\count($tenantIds)), - ]) - ))); - foreach ($found as $doc) { - $existingDocs[$tenant . ':' . $doc->getId()] = $doc; + foreach (\array_chunk($tenantIds, self::RELATION_QUERY_CHUNK_SIZE) as $chunk) { + $found = $this->authorization->skip(fn () => $this->withTenant($tenant, fn () => $this->silent( + fn () => $this->find($collection->getId(), [ + Query::equal('$id', $chunk), + Query::limit(PHP_INT_MAX), + ]) + ))); + foreach ($found as $doc) { + $existingDocs[$tenant . ':' . $doc->getId()] = $doc; + } } } } else { @@ -7176,14 +7178,16 @@ public function upsertDocumentsWithIncrease( ))); if (!empty($docIds)) { - $existing = $this->authorization->skip(fn () => $this->silent( - fn () => $this->find($collection->getId(), [ - Query::equal('$id', $docIds), - Query::limit(\count($docIds)), - ]) - )); - foreach ($existing as $doc) { - $existingDocs[$this->tenantKey($doc)] = $doc; + foreach (\array_chunk($docIds, self::RELATION_QUERY_CHUNK_SIZE) as $chunk) { + $existing = $this->authorization->skip(fn () => $this->silent( + fn () => $this->find($collection->getId(), [ + Query::equal('$id', $chunk), + Query::limit(PHP_INT_MAX), + ]) + )); + foreach ($existing as $doc) { + $existingDocs[$this->tenantKey($doc)] = $doc; + } } } } diff --git a/src/Database/Mirror.php b/src/Database/Mirror.php index 54623961d..70c353e4d 100644 --- a/src/Database/Mirror.php +++ b/src/Database/Mirror.php @@ -612,14 +612,16 @@ public function createDocuments( )); if (!empty($ids)) { - $existing = $this->source->silent( - fn () => $this->source->find($collection, [ - Query::equal('$id', $ids), - Query::limit(\count($ids)), - ]) - ); - foreach ($existing as $doc) { - $existingIds[$doc->getId()] = true; + foreach (\array_chunk(\array_unique($ids), self::RELATION_QUERY_CHUNK_SIZE) as $chunk) { + $existing = $this->source->silent( + fn () => $this->source->find($collection, [ + Query::equal('$id', $chunk), + Query::limit(PHP_INT_MAX), + ]) + ); + foreach ($existing as $doc) { + $existingIds[$doc->getId()] = true; + } } } } From 52b189bded7ef409bb978483a7231d779051b510 Mon Sep 17 00:00:00 2001 From: Prem Palanisamy Date: Wed, 15 Apr 2026 11:53:53 +0100 Subject: [PATCH 25/25] Chunk id lookups by \$this->maxQueryValues, not RELATION_QUERY_CHUNK_SIZE CodeRabbit #3084994641 + #3084994657 follow-up: the earlier chunking fix used self::RELATION_QUERY_CHUNK_SIZE (5000), which matches the existing populateDocumentsRelationshipsBatch pattern but doesn't respect the user-configurable validator ceiling. Query::equal('\$id', \$chunk) is validated by DocumentsValidator against \$this->maxQueryValues, not RELATION_QUERY_CHUNK_SIZE. A caller doing \$db->setMaxQueryValues(100) followed by any of the three chunked pre-reads would still throw QueryException because chunks of 5000 exceed the configured 100. Switch to array_chunk(..., max(1, \$this->maxQueryValues)) at all three sites so chunk size always respects the actual validator limit. In the default case (maxQueryValues = 5000) this produces identical chunks to the old code; under customization it correctly scales down. Note: the 5 existing relationship-population chunking sites still use RELATION_QUERY_CHUNK_SIZE and have the same latent bug under customized maxQueryValues. Leaving that for a follow-up since fixing the pattern globally is out of scope for this PR. --- src/Database/Database.php | 4 ++-- src/Database/Mirror.php | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Database/Database.php b/src/Database/Database.php index f250cd3d8..f4dcede34 100644 --- a/src/Database/Database.php +++ b/src/Database/Database.php @@ -7159,7 +7159,7 @@ public function upsertDocumentsWithIncrease( } foreach ($idsByTenant as $tenant => $tenantIds) { $tenantIds = \array_values(\array_unique($tenantIds)); - foreach (\array_chunk($tenantIds, self::RELATION_QUERY_CHUNK_SIZE) as $chunk) { + foreach (\array_chunk($tenantIds, \max(1, $this->maxQueryValues)) as $chunk) { $found = $this->authorization->skip(fn () => $this->withTenant($tenant, fn () => $this->silent( fn () => $this->find($collection->getId(), [ Query::equal('$id', $chunk), @@ -7178,7 +7178,7 @@ public function upsertDocumentsWithIncrease( ))); if (!empty($docIds)) { - foreach (\array_chunk($docIds, self::RELATION_QUERY_CHUNK_SIZE) as $chunk) { + foreach (\array_chunk($docIds, \max(1, $this->maxQueryValues)) as $chunk) { $existing = $this->authorization->skip(fn () => $this->silent( fn () => $this->find($collection->getId(), [ Query::equal('$id', $chunk), diff --git a/src/Database/Mirror.php b/src/Database/Mirror.php index 70c353e4d..2f799eb88 100644 --- a/src/Database/Mirror.php +++ b/src/Database/Mirror.php @@ -612,7 +612,7 @@ public function createDocuments( )); if (!empty($ids)) { - foreach (\array_chunk(\array_unique($ids), self::RELATION_QUERY_CHUNK_SIZE) as $chunk) { + foreach (\array_chunk(\array_unique($ids), \max(1, $this->maxQueryValues)) as $chunk) { $existing = $this->source->silent( fn () => $this->source->find($collection, [ Query::equal('$id', $chunk),