diff --git a/PR_BODY.md b/PR_BODY.md new file mode 100644 index 000000000..438c1a658 --- /dev/null +++ b/PR_BODY.md @@ -0,0 +1,35 @@ +## Problem + +`getDocument()` uses a look-aside cache that is invalidated by **purging**. The purge is **not atomic with the committing transaction**, so under concurrency a reader that missed the cache can read the pre-commit row and re-`save` it into the cache **after** the writer's purge — leaving a stale entry that subsequent reads serve until TTL. + +The existing `forUpdate` bypass (added in `ForUpdateCacheTest`) only protects the **locking read inside `updateDocument`**. A normal `getDocument()` still serves the stale snapshot — `testForUpdateReadBypassesStaleCache` even asserts this. That remaining read-path window is the root cause of **flaky project-config E2E tests**: `ProjectsConsoleClientTest::testGetProject` issues ~12 rapid read-modify-write PATCHes against a single project document and intermittently reads back old `auths` / `services` values. + +## Evidence + +Runtime trace under Swoole (one coroutine per request, `upd` = `$updatedAt`): + +``` +[20] SAVE upd=51.030 coro 20 caches the row it read at request-init +[20] WRITE upd=51.085 coro 20 commits a NEWER version +[20] PURGE in-transaction purge +[20] PURGE post-commit purge +[21] HIT upd=51.030 coro 21 served STALE 51.030 from cache +``` + +In a real cloud E2E environment, `testGetProject` failed ~19 of 20 runs at varying assertions before the fix, and **20/20 after**. The failures were field-agnostic (`authPasswordDictionary`, `authPersonalDataCheck`, `authSessionsLimit` reverting to its default `0`, service flags, list sizes) — consistent with whichever write lost the cache race on a given run. + +## Fix + +- On every committed `updateDocument` / `deleteDocument`, record the committed `$updatedAt` in a **sibling cache key** (`:__ver`) that `purgeCachedDocument()` does **not** delete, so it outlives the purge of the document body. +- On read, discard any cached snapshot whose `$updatedAt` predates that marker and reload from the adapter. +- The guard is **skipped** when no marker exists or the timestamp can't be parsed, so behaviour is unchanged outside the race. `cacheVersionStamp()` normalises `string` / `DateTimeInterface` / Mongo `UTCDateTime` values to a comparable microsecond timestamp. + +## Tests + +- New deterministic regression test `ForUpdateCacheTest::testReadRejectsStaleCacheSnapshotReCachedAfterUpdate` — **fails without the fix** (`'stale'`), passes with it. It reproduces the race at the cache layer (re-`save` of a pre-commit snapshot after a committed update) without needing real concurrency. +- Full unit suite: **367 tests, 2160 assertions — OK**. Pint PSR-12 ✅, PHPStan level 7 ✅. + +## Notes / open questions + +- The `:__ver` marker relies on `$updatedAt` advancing per write; with `preserveDates` enabled or sub-microsecond updates two versions could tie (comparison is strict `<`, so a tie keeps today's behaviour). Could switch to a monotonic counter if preferred. +- Marker reads use `self::TTL`, so the marker expires in lockstep with the document cache entry it guards. \ No newline at end of file diff --git a/src/Database/Database.php b/src/Database/Database.php index 5a6cd97d7..b9b4b3eeb 100644 --- a/src/Database/Database.php +++ b/src/Database/Database.php @@ -174,6 +174,10 @@ class Database // Cache public const TTL = 60 * 60 * 24; // 24 hours + // Suffix for the per-document cache key that records the latest committed + // version, used to reject stale snapshots re-cached by racing readers. + public const CACHE_VERSION_SUFFIX = '__ver'; + // Events public const EVENT_ALL = '*'; @@ -4852,6 +4856,28 @@ public function getDocument(string $collection, string $id, array $queries = [], } } + // Reject a cached snapshot that predates the latest committed write. Cache + // invalidation is a purge, which is not atomic with the committing + // transaction: a concurrent reader can re-cache the pre-commit row after a + // writer's purge, leaving a stale entry that later reads would serve. Each + // write records the committed $updatedAt in a sibling marker key; discard + // any cached copy older than it so the stale entry cannot be served. + if ($cached !== null && isset($cached['$updatedAt'])) { + $marker = null; + try { + $marker = $this->cache->load($documentKey . ':' . self::CACHE_VERSION_SUFFIX, self::TTL); + } catch (Exception $e) { + Console::warning('Warning: Failed to get cache version marker: ' . $e->getMessage()); + } + + if (\is_string($marker) && $marker !== '') { + $cachedVersion = $this->cacheVersionStamp($cached['$updatedAt']); + if ($cachedVersion !== null && (float) $cachedVersion < (float) $marker) { + $cached = null; + } + } + } + if ($cached) { $document = $this->createDocumentInstance($collection->getId(), $cached); @@ -6386,6 +6412,9 @@ public function updateDocument(string $collection, string $id, Document $documen // Purge again after commit so readers cannot re-cache the pre-commit version $this->purgeCachedDocumentInternal($collection->getId(), $id); + // Record the committed version so a stale snapshot re-cached by a racing + // reader is rejected on subsequent reads (see getDocument). + $this->recordCachedDocumentVersion($collection->getId(), $id, $document->getUpdatedAt()); if (!$this->inBatchRelationshipPopulation && $this->resolveRelationships) { $documents = $this->silent(fn () => $this->populateDocumentsRelationships([$document], $collection, $this->relationshipFetchDepth)); @@ -7775,6 +7804,9 @@ public function deleteDocument(string $collection, string $id): bool if ($deleted) { // Purge again after commit so readers cannot re-cache the pre-commit version $this->purgeCachedDocumentInternal($collection->getId(), $id); + // Advance the version marker past any snapshot so a stale "exists" copy + // re-cached by a racing reader is rejected on subsequent reads. + $this->recordCachedDocumentVersion($collection->getId(), $id, DateTime::now()); $this->trigger(self::EVENT_DOCUMENT_DELETE, $document); } @@ -8380,6 +8412,70 @@ protected function purgeCachedDocumentInternal(string $collectionId, ?string $id return true; } + /** + * Record the latest committed version of a document in the cache. + * + * Stored in a sibling key (CACHE_VERSION_SUFFIX) that purgeCachedDocument() + * does not delete, so it outlives the purge of the document body. getDocument() + * compares a cached snapshot's $updatedAt against this marker and discards any + * snapshot older than it — closing the window where a reader re-caches a + * pre-commit row after a writer's purge. + * + * @param string $collectionId + * @param string $id + * @param string|\Utopia\Database\DateTime|null $updatedAt committed $updatedAt + * @return void + */ + protected function recordCachedDocumentVersion(string $collectionId, string $id, mixed $updatedAt): void + { + $stamp = $this->cacheVersionStamp($updatedAt); + if ($stamp === null) { + return; + } + + [, $documentKey] = $this->getCacheKeys($collectionId, $id); + + try { + $this->cache->save($documentKey . ':' . self::CACHE_VERSION_SUFFIX, $stamp); + } catch (Exception $e) { + Console::warning('Warning: Failed to record cache version marker: ' . $e->getMessage()); + } + } + + /** + * Normalise a document $updatedAt value (string, DateTime, or Mongo UTCDateTime) + * into a comparable microsecond-precision UNIX timestamp string, or null when + * it cannot be determined (in which case the version guard is skipped). + * + * @param mixed $updatedAt + * @return string|null + */ + private function cacheVersionStamp(mixed $updatedAt): ?string + { + if (empty($updatedAt)) { + return null; + } + + try { + if ($updatedAt instanceof \DateTimeInterface) { + return $updatedAt->format('U.u'); + } + + // MongoDB\BSON\UTCDateTime and similar value objects. + if (\is_object($updatedAt) && \method_exists($updatedAt, 'toDateTime')) { + return $updatedAt->toDateTime()->format('U.u'); + } + + if (\is_string($updatedAt)) { + return (new \DateTime($updatedAt))->format('U.u'); + } + } catch (\Throwable $e) { + return null; + } + + return null; + } + /** * Cleans a specific document from cache and triggers EVENT_DOCUMENT_PURGE. * And related document reference in the collection cache. diff --git a/tests/unit/ForUpdateCacheTest.php b/tests/unit/ForUpdateCacheTest.php index aef2736b3..ac945da24 100644 --- a/tests/unit/ForUpdateCacheTest.php +++ b/tests/unit/ForUpdateCacheTest.php @@ -17,10 +17,13 @@ class ForUpdateCacheTest extends TestCase private Database $database; + private Cache $cache; + protected function setUp(): void { $this->adapter = new DatabaseMemory(); - $this->database = new Database($this->adapter, new Cache(new CacheMemory())); + $this->cache = new Cache(new CacheMemory()); + $this->database = new Database($this->adapter, $this->cache); $this->database ->setDatabase('utopiaTests') ->setNamespace('for_update_' . \uniqid()); @@ -101,4 +104,31 @@ public function testUpdateDocumentDoesNotResurrectStaleCachedAttributes(): void $this->assertSame('fresh', $document->getAttribute('name')); $this->assertSame('updated', $document->getAttribute('description')); } + + public function testReadRejectsStaleCacheSnapshotReCachedAfterUpdate(): void + { + // Prime the cache with the original snapshot. + $original = $this->database->getDocument('projects', 'project'); + $this->assertSame('stale', $original->getAttribute('name')); + + // Commit a newer version through Database: this purges the cache and + // records the committed $updatedAt as the version marker. usleep guarantees + // a strictly later $updatedAt than the snapshot captured above. + \usleep(2000); + $this->database->updateDocument('projects', 'project', new Document([ + 'name' => 'fresh', + ])); + + // Simulate a concurrent reader whose pre-commit read landed in the cache + // after the writer's purge — the exact race the version marker defends + // against. The resurrected snapshot carries the OLD $updatedAt. + [, $documentKey, $hashKey] = $this->database->getCacheKeys('projects', 'project'); + $this->cache->save($documentKey, $original->getArrayCopy(), $hashKey); + + // A normal (non-locking) read must not serve the resurrected stale snapshot: + // its $updatedAt is older than the recorded version marker, so it is rejected + // and the row is reloaded from the adapter. + $served = $this->database->getDocument('projects', 'project'); + $this->assertSame('fresh', $served->getAttribute('name')); + } }