hazedb/exec_select.go at main · VeloxCoding/hazedb · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
package hazedb

import (
	"slices"
	"sort"
)

// execSelectPK reads the single PK-matched row, projected (or the whole row for
// SELECT *). Returns no rows for LIMIT 0, any OFFSET, or a NULL key.
func (db *DB) execSelectPK(pl *plan, keyVal Value) ([]string, []Row, error) {
	if keyVal.IsNull() {
		return pl.colNames, nil, nil
	}
	pk, err := coerceToUUID(keyVal)
	if err != nil {
		return nil, nil, err
	}
	return db.execSelectPKResolved(pl, pk)
}

// execSelectPKResolved serves a PK SELECT once the key is a resolved UUID — the
// shared core of execSelectPK and the direct-UUID read fast path (which skips the
// Value round-trip + coerce).
func (db *DB) execSelectPKResolved(pl *plan, pk UUID) ([]string, []Row, error) {
	st := pl.st.(*selectStmt)
	tbl := pl.rt
	colNames := pl.colNames
	// A PK match is at most one row, so LIMIT 0 or any OFFSET drops it.
	if st.limit == 0 || st.offset > 0 {
		return colNames, nil, nil
	}
	if st.starAll {
		r, ok := tbl.getByPK(pk)
		if !ok {
			return colNames, nil, nil
		}
		return colNames, []Row{r}, nil
	}
	pr, ok := tbl.getByPKProject(pk, pl.projOrdinals)
	if !ok {
		return colNames, nil, nil
	}
	return colNames, []Row{pr}, nil
}

// execSelectPKOne is execSelectPK for QueryRow: it returns the single matched
// row directly (nil if none), skipping the []Row result slice Query allocates.
func (db *DB) execSelectPKOne(pl *plan, keyVal Value) ([]string, Row, error) {
	if keyVal.IsNull() {
		return pl.colNames, nil, nil
	}
	pk, err := coerceToUUID(keyVal)
	if err != nil {
		return nil, nil, err
	}
	return db.execSelectPKOneResolved(pl, pk)
}

// execSelectPKOneResolved serves a single-row PK SELECT once the key is a resolved
// UUID — the shared core of execSelectPKOne and the QueryRow direct-UUID fast path
// (which skips the Value round-trip + coerce), mirroring execSelectPKResolved.
func (db *DB) execSelectPKOneResolved(pl *plan, pk UUID) ([]string, Row, error) {
	st := pl.st.(*selectStmt)
	tbl := pl.rt
	colNames := pl.colNames
	// A PK match is at most one row, so LIMIT 0 or any OFFSET drops it.
	if st.limit == 0 || st.offset > 0 {
		return colNames, nil, nil
	}
	if st.starAll {
		r, _ := tbl.getByPK(pk)
		return colNames, r, nil
	}
	pr, _ := tbl.getByPKProject(pk, pl.projOrdinals)
	return colNames, pr, nil
}

// execSelectPKResidual serves a SELECT whose WHERE pins the PK by equality inside
// an AND-chain (pl.pkProbe): fetch the one PK-addressed row, then return it only
// if the FULL WHERE matches, so residual conjuncts (WHERE id = ? AND age = ?) are
// honoured. A PK match is at most one row, so LIMIT 0 / any OFFSET drops it and
// ORDER BY needs no sort. The row is cloned under the lock by getByPK, so matching
// and projecting it off-lock is consistent.
func (db *DB) execSelectPKResidual(pl *plan, keyVal Value, args []Value) ([]string, []Row, error) {
	st := pl.st.(*selectStmt)
	tbl := pl.rt
	colNames := pl.colNames
	if st.limit == 0 || st.offset > 0 {
		return colNames, nil, nil
	}
	if keyVal.IsNull() {
		return colNames, nil, nil
	}
	pk, err := coerceToUUID(keyVal)
	if err != nil {
		return nil, nil, err
	}
	r, ok := tbl.getByPK(pk)
	if !ok {
		return colNames, nil, nil
	}
	match := rowMatcher(st.where, &evalCtx{cols: tbl.def.colByName, args: args})
	if !match(r) {
		return colNames, nil, nil
	}
	if st.starAll {
		return colNames, []Row{r}, nil
	}
	proj := make(Row, len(pl.projOrdinals))
	for i, ord := range pl.projOrdinals {
		proj[i] = r[ord]
	}
	return colNames, []Row{proj}, nil
}

// offerLiveRow offers pk's live row to an ORDER BY top-N heap under the shard
// read lock: pred (the full WHERE) and the order-column compare read the row in
// place, and topN.offer clones it only if it makes the cut. So ORDER BY ... LIMIT
// n over a large filtered set clones ~n rows, not the whole matched set.
func (t *table) offerLiveRow(pk UUID, pred func(Row) bool, top *topN) {
	s := t.shardOf(pk)
	s.mu.RLock()
	if rowID, ok := s.pk.get(pk); ok {
		if r := s.rows[rowID]; r != nil && pred(r) {
			top.offer(r)
		}
	}
	s.mu.RUnlock()
}

// collectLiveRow is offerLiveRow's unbounded form (ORDER BY without LIMIT): under
// the shard read lock it fetches pk's live row, checks pred, and on a pass
// collects its (order key + projection) into top — capturing only those, never a
// full-row clone to narrow again later.
func (t *table) collectLiveRow(pk UUID, pred func(Row) bool, top *topN) {
	s := t.shardOf(pk)
	s.mu.RLock()
	if rowID, ok := s.pk.get(pk); ok {
		if r := s.rows[rowID]; r != nil && pred(r) {
			top.collect(r)
		}
	}
	s.mu.RUnlock()
}

// getMatchProject fetches pk's live row, evaluates pred (the full WHERE) on it
// UNDER the shard read lock, and on a pass returns the projection: the ords
// columns cloned, or the whole row for SELECT *. Folding the WHERE check into the
// lock lets an indexed read skip the full-row clone getByPK takes only to
// evaluate and then project from — it clones just the columns it returns.
// Mirrors offerLiveRow; the secondary-index path is non-partitioned.
func (t *table) getMatchProject(pk UUID, pred func(Row) bool, ords []int, starAll bool) (Row, bool) {
	s := t.shardOf(pk)
	s.mu.RLock()
	defer s.mu.RUnlock()
	rowID, ok := s.pk.get(pk)
	if !ok {
		return nil, false
	}
	r := s.rows[rowID]
	if r == nil || !pred(r) {
		return nil, false
	}
	if starAll {
		return r.Clone(), true
	}
	return projectClone(r, ords), true
}

// getMatchProjectInto is the scan-into form of getMatchProject: it writes the
// projection (or the whole row for SELECT *) into dst, reset to empty first, so a
// projection without BYTES columns makes no allocation. The single-row
// QueryRowByIndex fast path. Non-partitioned.
func (t *table) getMatchProjectInto(pk UUID, pred func(Row) bool, ords []int, starAll bool, dst []Value) ([]Value, bool) {
	return t.appendMatchProject(pk, pred, ords, starAll, dst[:0])
}

// appendMatchProject is getMatchProjectInto without the reset: on a WHERE pass it
// APPENDS the projection (or the whole row for SELECT *) to dst and returns the
// grown slice; on a miss it returns dst unchanged. This lets a multi-row result
// pack every row's cells into one backing buffer — each Row a capped view of its
// own span — so the set costs one allocation instead of one per row. A projection
// without BYTES columns appends no allocation of its own. Non-partitioned.
func (t *table) appendMatchProject(pk UUID, pred func(Row) bool, ords []int, starAll bool, dst []Value) ([]Value, bool) {
	s := t.shardOf(pk)
	s.mu.RLock()
	defer s.mu.RUnlock()
	rowID, ok := s.pk.get(pk)
	if !ok {
		return dst, false
	}
	r := s.rows[rowID]
	if r == nil || (pred != nil && !pred(r)) { // nil pred = caller guarantees the match
		return dst, false
	}
	if starAll {
		return appendRowClone(dst, r), true
	}
	return appendProjectClone(dst, r, ords), true
}

// appendMatchJSON is appendMatchProject's encode-under-lock form: it fetches
// pk's live row, checks pred (the full WHERE) under the shard read lock, and on
// a pass appends the row as a flat JSON object into dst straight from the live
// row — no Row clone. Used by the index → JSON read path; non-partitioned, as
// secondary indexes are. Appends nothing on a miss, so a caller probing several
// index candidates can pass the same dst each time.
func (t *table) appendMatchJSON(pk UUID, pred func(Row) bool, cols []string, ords []int, starAll bool, dst []byte) ([]byte, bool) {
	s := t.shardOf(pk)
	s.mu.RLock()
	defer s.mu.RUnlock()
	rowID, ok := s.pk.get(pk)
	if !ok {
		return dst, false
	}
	r := s.rows[rowID]
	if r == nil || !pred(r) {
		return dst, false
	}
	if starAll {
		return appendRowJSONObject(dst, cols, r), true
	}
	return appendRowJSONObjectProject(dst, cols, r, ords), true
}

// idxCandidateSet is the candidate-PK enumerator for an indexed SELECT: the
// (intersected) index hits UNION the dirty overlay. pks holds the slice
// secIndex.lookup returns (a fresh copy of the bucket — the live bucket must not
// escape the index lock) and dirty holds dirtyPKs()'s freshly-copied overlay, so
// the set owns its slices rather than aliasing index state. Returned by value,
// and emit takes its visitor as an argument rather than being a heap closure that
// captures the slices (which escaped). Shared by execSelectIdx (materialized) and
// selectEach (streaming) so the hybrid index∪dirty correctness has a single
// definition.
type idxCandidateSet struct {
	pks   []UUID // index hits (intersected); already unique within the index
	dirty []UUID // dirty overlay (mutated since the last merge); may overlap pks
}

// capHint is a result-slice capacity proportional to the candidate count,
// capped by limit (the scan stops there) and an absolute ceiling (a huge
// candidate set with no LIMIT must not prealloc pathologically). limit < 0
// means no LIMIT.
func (c idxCandidateSet) capHint(limit int) int {
	n := len(c.pks) + len(c.dirty)
	if limit >= 0 && limit < n {
		n = limit
	}
	if n > 1024 {
		n = 1024
	}
	return n
}

// emit visits each unique candidate PK once, calling fn (true to stop). The
// index buckets are already unique, so a dedup set is built only when the dirty
// overlay is non-empty (a PK may then be in both) — the steady-state read
// (merged, dirty empty) walks the bucket with no per-candidate map.
func (c idxCandidateSet) emit(fn func(pk UUID) bool) {
	if len(c.dirty) == 0 {
		for _, pk := range c.pks {
			if fn(pk) {
				return
			}
		}
		return
	}
	seen := make(map[UUID]struct{}, len(c.pks)+len(c.dirty))
	visit := func(pk UUID) bool {
		if _, dup := seen[pk]; dup {
			return false
		}
		seen[pk] = struct{}{}
		return fn(pk)
	}
	for _, pk := range c.pks {
		if visit(pk) {
			return
		}
	}
	for _, pk := range c.dirty {
		if visit(pk) {
			return
		}
	}
}

// idxCandidates resolves the candidate set for an indexed SELECT. ok=false means
// the result is provably empty (a NULL key, a missing index, or no index hit AND
// no dirty overlay) — the caller returns no rows.
func (db *DB) idxCandidates(pl *plan, ctx *evalCtx) (cand idxCandidateSet, ok bool, err error) {
	tbl := pl.rt
	// Index side: one bucket per indexed equality conjunct, intersected. With
	// two indexes (WHERE name = ? AND city = ?) this shrinks the candidate set
	// to rows matching BOTH before any row is fetched — e.g. the 1000 Peters in
	// Amsterdam, not all 8000 Peters. A NULL key matches nothing.
	var pks []UUID
	for i, ord := range pl.idxCols {
		keyVal, err := evalExpr(pl.idxSrcs[i], ctx)
		if err != nil {
			return idxCandidateSet{}, false, err
		}
		if keyVal.IsNull() {
			return idxCandidateSet{}, false, nil
		}
		si := tbl.indexFor(ord)
		if si == nil {
			return idxCandidateSet{}, false, nil
		}
		bucket := si.lookup(keyOf(keyVal))
		if i == 0 {
			pks = bucket
		} else {
			pks = intersectPKs(pks, bucket)
		}
		if len(pks) == 0 {
			break // index side empty; the dirty overlay below may still match
		}
	}
	// Hybrid candidate set: index hits UNION the dirty PKs (membership uncertain).
	// Every candidate's live row is evaluated against the FULL WHERE by the
	// caller, so neither a stale entry, an unrelated dirty PK, nor an extra
	// conjunct yields a wrong row.
	dirty := tbl.dirtyPKs()
	if len(pks) == 0 && len(dirty) == 0 {
		return idxCandidateSet{}, false, nil
	}
	return idxCandidateSet{pks: pks, dirty: dirty}, true, nil
}

// execSelectIdx runs a SELECT whose WHERE pins one or more secondary-indexed
// columns by equality. It resolves candidate PKs through the index(es)
// (intersecting buckets for an AND of equalities) plus the dirty overlay, then
// evaluates the full WHERE on each live row. With an ORDER BY it gathers all
// matches and sorts before LIMIT (the filtered-list pattern); otherwise it
// projects and stops at LIMIT.
func (db *DB) execSelectIdx(pl *plan, args []Value) ([]string, []Row, error) {
	st := pl.st.(*selectStmt)
	if st.limit == 0 {
		return pl.colNames, nil, nil
	}
	// Point-read fast path: a single indexed equality that fully covers the WHERE
	// (idxExact), no ORDER BY, no dirty overlay, and a single live hit. Fetch and
	// project that one row directly — no eval context, bucket copy, candidate set,
	// or result packing. A multi-hit or any dirty row falls through to the general
	// path (which builds the context lazily below).
	if pl.idxExact && pl.orderOrdinal < 0 && len(pl.idxCols) == 1 && pl.rt.readDirtyCount.Load() == 0 {
		keyVal, err := evalLitOrParamValue(pl.idxSrcs[0], args)
		if err != nil {
			return nil, nil, err
		}
		if keyVal.IsNull() {
			return pl.colNames, nil, nil
		}
		if si := pl.rt.indexFor(pl.idxCols[0]); si != nil {
			pk, found, one := si.lookupOne(keyOf(keyVal))
			if !found {
				return pl.colNames, nil, nil
			}
			if one {
				if st.offset > 0 {
					return pl.colNames, nil, nil // the lone row is dropped by OFFSET
				}
				var pr Row
				var ok bool
				if st.starAll {
					pr, ok = pl.rt.getByPK(pk)
				} else {
					pr, ok = pl.rt.getByPKProject(pk, pl.projOrdinals)
				}
				if !ok {
					return pl.colNames, nil, nil
				}
				return pl.colNames, []Row{pr}, nil
			}
			// found && !one: a multi-hit bucket → fall through to the general path.
		}
	}
	// General path: a multi-hit / multi-column / ORDER BY / dirty-overlay index
	// read. Build the eval context once for candidate resolution + the WHERE check.
	ctx := &evalCtx{cols: pl.rt.def.colByName, args: args}
	cand, ok, err := db.idxCandidates(pl, ctx)
	if err != nil {
		return nil, nil, err
	}
	if !ok {
		return pl.colNames, nil, nil
	}
	return db.execCandidates(pl, ctx, cand)
}

// execCandidates materializes a SELECT from a resolved candidate set (index hits
// ∪ dirty overlay): it evaluates the full WHERE on each candidate's live row and,
// with an ORDER BY, sorts before LIMIT (a top-N heap when LIMITed); otherwise it
// projects and stops at LIMIT. Shared by the single-column index path and the
// composite prefix-lookup path. Callers handle the LIMIT 0 early-out.
func (db *DB) execCandidates(pl *plan, ctx *evalCtx, cand idxCandidateSet) ([]string, []Row, error) {
	st := pl.st.(*selectStmt)
	tbl := pl.rt
	colNames := pl.colNames

	// ORDER BY: gather every matching live row, sort by the order column, then
	// LIMIT. The candidate set is the index-narrowed (filtered) subset — the
	// "list view" pattern (WHERE author = ? ORDER BY date LIMIT 20) — so sorting
	// it is cheap. (Sorting a near-whole-table subset is the caller's call; that
	// is no longer a list view.)
	if pl.orderOrdinal >= 0 {
		pred := rowMatcher(st.where, ctx)
		// ORDER BY + LIMIT: a top-N heap clones only ~limit rows, so the cost
		// tracks the LIMIT, not the matched-set size (offer reads/clones the live
		// row under the shard lock). st.limit == 0 already returned above.
		if st.limit >= 0 {
			// Keep offset+limit rows so the offset can be dropped after sorting.
			top := &topN{ord: pl.orderOrdinal, desc: st.orderDesc, capN: fetchBound(st.limit, st.offset), proj: projOrNil(st.starAll, pl.projOrdinals)}
			cand.emit(func(pk UUID) bool {
				tbl.offerLiveRow(pk, pred, top)
				return false
			})
			return colNames, sliceOffsetLimit(top.sorted(), st.offset, st.limit), nil
		}
		// ORDER BY without LIMIT: capture only (order key + projection) per match
		// under the shard lock, then sort by the captured key (OFFSET drops the
		// leading rows). No full-row clone, no second projection pass.
		top := topN{ord: pl.orderOrdinal, desc: st.orderDesc, proj: projOrNil(st.starAll, pl.projOrdinals)}
		cand.emit(func(pk UUID) bool {
			tbl.collectLiveRow(pk, pred, &top)
			return false
		})
		return colNames, sliceOffsetLimit(top.sorted(), st.offset, st.limit), nil
	}

	// No ORDER BY: project and stop once offset+limit rows are collected. Pack
	// every result row's cells into ONE growing backing buffer; each Row is a
	// capped view of its own span (the full-slice expression bounds its capacity),
	// so the set owns its cells with one allocation instead of one per row, while a
	// caller still cannot append past its row into the next. packed is presized
	// from the candidate count × cells-per-row so a within-hint bucket never
	// regrows. A later regrow is still correct: earlier Row views keep pointing at
	// their (still-live) prior backing.
	eff := fetchBound(st.limit, st.offset)
	hint := cand.capHint(eff)
	out := make([]Row, 0, hint)
	packed := make([]Value, 0, hint*len(colNames))
	// A fresh index hit needs no re-check when the WHERE is exactly the indexed
	// equalities (idxExact) and nothing is dirty: the bucket is authoritative and a
	// deleted row is dropped by the fetch. Skipping the matcher also avoids
	// compiling it. With a dirty overlay (possibly stale entries) or a residual
	// conjunct, the full WHERE is re-checked per candidate.
	var pred func(Row) bool
	if !(pl.idxExact && len(cand.dirty) == 0) {
		pred = rowMatcher(st.where, ctx)
	}
	// Dirty-empty fast path: walk the index hits directly, no dedup map and no emit
	// closure (which would escape to the heap). The dirty union needs both.
	if len(cand.dirty) == 0 {
		for _, pk := range cand.pks {
			start := len(packed)
			var ok bool
			packed, ok = tbl.appendMatchProject(pk, pred, pl.projOrdinals, st.starAll, packed)
			if !ok {
				continue
			}
			out = append(out, Row(packed[start:len(packed):len(packed)]))
			if eff >= 0 && len(out) >= eff {
				break
			}
		}
		return colNames, sliceOffsetLimit(out, st.offset, st.limit), nil
	}
	cand.emit(func(pk UUID) bool {
		start := len(packed)
		var ok bool
		packed, ok = tbl.appendMatchProject(pk, pred, pl.projOrdinals, st.starAll, packed)
		if !ok {
			return false
		}
		out = append(out, Row(packed[start:len(packed):len(packed)]))
		return eff >= 0 && len(out) >= eff
	})
	return colNames, sliceOffsetLimit(out, st.offset, st.limit), nil
}

// execSelectIdxOne is the single-row (QueryRow) form of execSelectIdx for a
// SELECT with no ORDER BY: it returns the first candidate whose live row passes
// the full WHERE and stops, skipping the []Row slice + collect machinery the
// multi-row path builds. A dirty PK already covered by the index pass that
// failed the WHERE simply re-fails (no dedup needed for first-match).
func (db *DB) execSelectIdxOne(pl *plan, args []Value) ([]string, Row, error) {
	st := pl.st.(*selectStmt)
	tbl := pl.rt
	colNames := pl.colNames
	if st.limit == 0 {
		return colNames, nil, nil
	}
	// Point-read fast path (mirrors execSelectIdx): a single indexed equality that
	// fully covers the WHERE (idxExact) with no dirty overlay and a single live hit
	// → fetch + project that one row directly — no eval context, bucket copy, or
	// per-row matcher. A multi-hit bucket or any dirty row falls through to the
	// general first-match loop below. The caller (queryRowPlanV) guarantees no
	// ORDER BY / OFFSET for this path, so the lone hit is the answer.
	if pl.idxExact && len(pl.idxCols) == 1 && tbl.readDirtyCount.Load() == 0 {
		keyVal, err := evalLitOrParamValue(pl.idxSrcs[0], args)
		if err != nil {
			return nil, nil, err
		}
		if keyVal.IsNull() {
			return colNames, nil, nil
		}
		if si := tbl.indexFor(pl.idxCols[0]); si != nil {
			pk, found, one := si.lookupOne(keyOf(keyVal))
			if !found {
				return colNames, nil, nil
			}
			if one {
				if st.starAll {
					r, _ := tbl.getByPK(pk)
					return colNames, r, nil
				}
				pr, _ := tbl.getByPKProject(pk, pl.projOrdinals)
				return colNames, pr, nil
			}
			// found && !one: a multi-hit bucket → fall through to the general path.
		}
	}
	ctx := evalCtx{cols: tbl.def.colByName, args: args}
	var pks []UUID
	for i, ord := range pl.idxCols {
		keyVal, err := evalExpr(pl.idxSrcs[i], &ctx)
		if err != nil {
			return nil, nil, err
		}
		if keyVal.IsNull() {
			return colNames, nil, nil
		}
		si := tbl.indexFor(ord)
		if si == nil {
			return colNames, nil, nil
		}
		bucket := si.lookup(keyOf(keyVal))
		if i == 0 {
			pks = bucket
		} else {
			pks = intersectPKs(pks, bucket)
		}
		if len(pks) == 0 {
			break
		}
	}
	pred := rowMatcher(st.where, &ctx)
	try := func(pk UUID) (Row, bool) {
		return tbl.getMatchProject(pk, pred, pl.projOrdinals, st.starAll)
	}
	for _, pk := range pks {
		if r, ok := try(pk); ok {
			return colNames, r, nil
		}
	}
	for _, pk := range tbl.dirtyPKs() {
		if r, ok := try(pk); ok {
			return colNames, r, nil
		}
	}
	return colNames, nil, nil
}

// compositeCandidates resolves the candidate set for a composite prefix lookup:
// the PKs whose composite key starts with the encoded pinned prefix, UNION the
// dirty overlay. ok=false means provably empty (a NULL in the prefix, a missing
// index, or no hit and no dirty). The caller evaluates the full WHERE on every
// candidate, so a stale entry or an over-broad prefix yields no wrong row.
func (db *DB) compositeCandidates(pl *plan, ctx *evalCtx) (idxCandidateSet, bool, error) {
	tbl := pl.rt
	si := tbl.indexByOrdinals(pl.compOrdinals)
	if si == nil {
		return idxCandidateSet{}, false, nil
	}
	prefix := make([]Value, len(pl.compPrefixSrcs))
	for i, src := range pl.compPrefixSrcs {
		v, err := evalExpr(src, ctx)
		if err != nil {
			return idxCandidateSet{}, false, err
		}
		if v.IsNull() {
			return idxCandidateSet{}, false, nil
		}
		prefix[i] = v
	}
	pks := si.prefixLookup(encodeCompositeKey(prefix))
	dirty := tbl.dirtyPKs()
	if len(pks) == 0 && len(dirty) == 0 {
		return idxCandidateSet{}, false, nil
	}
	return idxCandidateSet{pks: pks, dirty: dirty}, true, nil
}

// execSelectCompositeLookup runs a SELECT whose WHERE pins a leading prefix of a
// composite index by equality. It resolves candidates through the prefix and
// reuses the shared candidate machinery (full-WHERE residual + ORDER BY sort).
func (db *DB) execSelectCompositeLookup(pl *plan, ctx *evalCtx) ([]string, []Row, error) {
	if pl.st.(*selectStmt).limit == 0 {
		return pl.colNames, nil, nil
	}
	cand, ok, err := db.compositeCandidates(pl, ctx)
	if err != nil {
		return nil, nil, err
	}
	if !ok {
		return pl.colNames, nil, nil
	}
	return db.execCandidates(pl, ctx, cand)
}

// compositeWalkArgs resolves the (snap, dirtyKey, residual) inputs orderedWalk
// needs for a composite prefix walk. ok=false means provably empty (missing
// index or a NULL in the pinned prefix). Shared by the materializing and
// streaming composite-walk entry points so both compute the walk identically.
func (db *DB) compositeWalkArgs(pl *plan, args []Value) (snap []ordEntry, dirtyKey func(Row) indexKey, residual []expr, ok bool, err error) {
	tbl := pl.rt
	si := tbl.indexByOrdinals(pl.compOrdinals)
	if si == nil {
		return nil, nil, nil, false, nil
	}
	ctx := evalCtx{cols: tbl.def.colByName, args: args}
	prefix := make([]Value, len(pl.compPrefixSrcs))
	for i, src := range pl.compPrefixSrcs {
		v, err := evalExpr(src, &ctx)
		if err != nil {
			return nil, nil, nil, false, err
		}
		if v.IsNull() {
			return nil, nil, nil, false, nil
		}
		prefix[i] = v
	}
	ords := si.ordinals
	// Reuse one cells buffer across every dirty row: buildDirtyCands calls dirtyKey
	// serially, and encodeCompositeKey copies the cells into a fresh key string, so
	// the buffer never escapes — one alloc per query instead of one per dirty row.
	cells := make([]Value, len(ords))
	dirtyKey = func(r Row) indexKey {
		for i, o := range ords {
			cells[i] = r[o]
		}
		return encodeCompositeKey(cells)
	}
	return si.snapshotPrefix(encodeCompositeKey(prefix)), dirtyKey, pl.compResidual, true, nil
}

// execSelectCompositeWalk serves WHERE <leading prefix> = ? ORDER BY <next col>
// via a composite ordered index: it walks the pinned-prefix sub-range of the
// sorted index — already ordered by the trailing column — and stops at LIMIT, so
// no sort runs. The walk reuses orderedWalk with a composite dirty key (the
// encoded tuple) so dirty rows merge into the same key space as the index
// entries. Every component is NOT NULL (planComposite's guard), so a matching
// row always has a fully-encodable key.
func (db *DB) execSelectCompositeWalk(pl *plan, args []Value) ([]string, []Row, error) {
	snap, dirtyKey, residual, ok, err := db.compositeWalkArgs(pl, args)
	if err != nil {
		return nil, nil, err
	}
	if !ok {
		return pl.colNames, nil, nil
	}
	return db.orderedWalk(pl, args, snap, dirtyKey, residual)
}

// dcand is one dirty-overlay candidate for an ordered walk: a row mutated since
// the last merge, tagged with its sort key.
type dcand struct {
	key indexKey
	row Row
}

// buildDirtyCands returns the dirty rows that pass match, each tagged with its
// sort key (keyFn) and sorted ascending, plus dirtySet — ALL dirty PKs, used to
// shadow possibly-stale index entries during the merge. Shared by every ordered
// walk (single-column, composite, join probe).
func (tbl *tableRT) buildDirtyCands(match func(Row) bool, keyFn func(Row) indexKey) ([]dcand, map[UUID]struct{}) {
	dirty := tbl.dirtyPKs()
	if len(dirty) == 0 {
		// Steady state after a merge: no overlay to shadow or sort. Skip the empty
		// map alloc + sort — mergeOrderedStreams handles a nil dc and nil dirtySet
		// (a nil-map lookup reports "not shadowed").
		return nil, nil
	}
	dirtySet := make(map[UUID]struct{}, len(dirty))
	var dc []dcand
	for _, pk := range dirty {
		if _, dup := dirtySet[pk]; dup {
			continue
		}
		dirtySet[pk] = struct{}{}
		if r, ok := tbl.getByPK(pk); ok && match(r) {
			dc = append(dc, dcand{keyFn(r), r})
		}
	}
	slices.SortFunc(dc, func(a, b dcand) int {
		if a.key.less(b.key) {
			return -1
		}
		if b.key.less(a.key) {
			return 1
		}
		return 0
	})
	return dc, dirtySet
}

// mergeOrderedStreams walks the sorted index slice snap and the pre-matched dirty
// candidates dc in the requested order (desc reverses both), skipping snap entries
// shadowed by dirtySet, and calls emitIdx(pk) for each surviving index entry and
// emitDirty(row) for each dirty row — in one merged ORDER BY order. It stops when
// done() reports enough collected, or both streams are exhausted. The consumer
// owns fetching/filtering/counting (so done() reflects rows actually kept).
func mergeOrderedStreams(snap []ordEntry, dc []dcand, dirtySet map[UUID]struct{}, desc bool, done func() bool, emitIdx func(pk UUID), emitDirty func(row Row)) {
	before := func(a, b indexKey) bool { // a comes before b in the requested order
		if desc {
			return b.less(a)
		}
		return a.less(b)
	}
	ii, dj := 0, 0
	if desc {
		ii, dj = len(snap)-1, len(dc)-1
	}
	idxOK := func() bool { return ii >= 0 && ii < len(snap) }
	dcOK := func() bool { return dj >= 0 && dj < len(dc) }
	step := func(p *int) {
		if desc {
			*p--
		} else {
			*p++
		}
	}
	for !done() {
		for idxOK() { // skip index entries shadowed by the dirty overlay
			if _, d := dirtySet[snap[ii].pk]; !d {
				break
			}
			step(&ii)
		}
		switch {
		case idxOK() && dcOK():
			if before(dc[dj].key, snap[ii].key) {
				emitDirty(dc[dj].row)
				step(&dj)
			} else {
				emitIdx(snap[ii].pk)
				step(&ii)
			}
		case dcOK():
			emitDirty(dc[dj].row)
			step(&dj)
		case idxOK():
			emitIdx(snap[ii].pk)
			step(&ii)
		default:
			return // both streams exhausted
		}
	}
}

// flipRangeOp mirrors a comparison so a `value OP col` predicate reads as
// `col flip(OP) value` (e.g. ? < age == age > ?).
func flipRangeOp(op tokenKind) tokenKind {
	switch op {
	case tkLt:
		return tkGt
	case tkLte:
		return tkGte
	case tkGt:
		return tkLt
	case tkGte:
		return tkLte
	}
	return op
}

// rangeBound is one side of a range constraint on a column: the evaluated bound
// value, its column-oriented operator, and whether it was present.
type rangeBound struct {
	val Value
	op  tokenKind
	ok  bool
}

// orderColumnRange scans the WHERE conjuncts for the lower (>= / >) and upper
// (< / <=) range bounds on the ORDER BY column (ordinal ord), evaluating each
// bound value. Either side may be absent. A `value OP col` predicate is read as
// `col flip(OP) value`. The first bound found on each side wins (a looser extra
// bound is harmless — the residual still filters).
func orderColumnRange(conj []expr, ord int, args []Value) (lo, hi rangeBound) {
	for _, c := range conj {
		b, isBin := c.(*binOp)
		if !isBin {
			continue
		}
		var op tokenKind
		var valExpr expr
		if cr, ok := b.lhs.(*colRef); ok && cr.ord == ord && isLitOrParam(b.rhs) {
			op, valExpr = b.op, b.rhs
		} else if cr, ok := b.rhs.(*colRef); ok && cr.ord == ord && isLitOrParam(b.lhs) {
			op, valExpr = flipRangeOp(b.op), b.lhs
		} else {
			continue
		}
		switch op {
		case tkGt, tkGte:
			if !lo.ok {
				if v, err := evalLitOrParamValue(valExpr, args); err == nil {
					lo = rangeBound{v, op, true}
				}
			}
		case tkLt, tkLte:
			if !hi.ok {
				if v, err := evalLitOrParamValue(valExpr, args); err == nil {
					hi = rangeBound{v, op, true}
				}
			}
		}
	}
	return
}

// seekOrderedSnap clips the ascending index snapshot to the window the WHERE's
// bounds on the ORDER BY column allow, so an ordered walk both STARTS at the lower
// bound and STOPS at the upper bound instead of scanning the rest of the index —
// each cut an O(log n) seek. The window is direction-independent: ASC walks it
// front-to-back, DESC back-to-front. The full WHERE stays the residual matcher, so
// a missing / kind-mismatched / NULL bound just skips that cut (the residual still
// filters); only entries that provably fail a bound are dropped.
func seekOrderedSnap(snap []ordEntry, conj []expr, ord int, args []Value) []ordEntry {
	if len(snap) == 0 {
		return snap
	}
	lo, hi := orderColumnRange(conj, ord, args)
	start, end := 0, len(snap)
	if lo.ok && !lo.val.IsNull() {
		if lk := keyOf(lo.val); lk.kind == snap[0].key.kind {
			if lo.op == tkGt { // first key > lo
				start = sort.Search(len(snap), func(i int) bool { return lk.less(snap[i].key) })
			} else { // first key >= lo
				start = sort.Search(len(snap), func(i int) bool { return !snap[i].key.less(lk) })
			}
		}
	}
	if hi.ok && !hi.val.IsNull() {
		if hk := keyOf(hi.val); hk.kind == snap[0].key.kind {
			if hi.op == tkLt { // first key >= hi
				end = sort.Search(len(snap), func(i int) bool { return !snap[i].key.less(hk) })
			} else { // first key > hi
				end = sort.Search(len(snap), func(i int) bool { return hk.less(snap[i].key) })
			}
		}
	}
	if start >= end { // empty or contradictory window
		return snap[:0]
	}
	return snap[start:end]
}

// orderedWalkArgs resolves the (snap, dirtyKey, residual) inputs orderedWalk
// needs for a single-column ordered walk. ok=false means a missing index. The
// index sits on the ORDER BY column, not the WHERE columns, so the snap
// guarantees nothing about the WHERE: the whole WHERE is residual. When the WHERE
// bounds the ORDER BY column by a range, the snap is clipped to that window so the
// walk skips the entries before AND after it. Shared by the materializing and
// streaming entry points.
func (db *DB) orderedWalkArgs(pl *plan, args []Value) (snap []ordEntry, dirtyKey func(Row) indexKey, residual []expr, ok bool) {
	si := pl.rt.indexFor(pl.orderOrdinal)
	if si == nil {
		return nil, nil, nil, false
	}
	ord := pl.orderOrdinal
	st := pl.st.(*selectStmt)
	collectConjuncts(st.where, &residual)
	snap = seekOrderedSnap(si.snapshot(), residual, ord, args)
	return snap, func(r Row) indexKey { return keyOf(r[ord]) }, residual, true
}

// execSelectOrderedWalk serves an ORDER BY on an ordered-indexed column by
// walking the sorted index in order, merged with the dirty overlay (rows
// mutated since the last merge), applying any residual WHERE, and stopping at
// LIMIT — touching ~LIMIT rows, not the whole table. A non-dirty index entry is
// fresh (its key equals the live value), so the index key drives the ordering
// and the row is fetched only when selected. Dirty PKs are excluded from the
// index walk (the entry may be stale) and supplied from their live rows.
func (db *DB) execSelectOrderedWalk(pl *plan, args []Value) ([]string, []Row, error) {
	snap, dirtyKey, residual, ok := db.orderedWalkArgs(pl, args)
	if !ok {
		return pl.colNames, nil, nil
	}
	return db.orderedWalk(pl, args, snap, dirtyKey, residual)
}

// orderedWalk merges a sorted index slice (snap, ascending key order) with the
// dirty overlay and emits rows in ORDER BY order, stopping at LIMIT — touching
// ~LIMIT rows, not the whole set. dirtyKey maps a dirty row to its sort key in
// snap's key space (single-column: keyOf(orderOrdinal); composite: the encoded
// tuple), so both streams merge on one comparator. A non-dirty snap entry is
// fresh, so its key drives the ordering and the row is fetched only when
// selected; dirty PKs are excluded from the snap walk (their entry may be stale)
// and supplied from their live rows.
//
// residual is the WHERE conjuncts NOT already guaranteed by the snap (for a
// single-column walk that is the whole WHERE; for a composite prefix walk it
// excludes the pinned equalities the sub-range already enforces). An empty
// residual means index rows need no per-row check, so they take the one-clone
// getByPKProject fast path. Dirty rows are always filtered by the FULL WHERE
// (buildDirtyCands' match) — the snap guarantees nothing about an unmerged row.
func (db *DB) orderedWalk(pl *plan, args []Value, snap []ordEntry, dirtyKey func(Row) indexKey, residual []expr) ([]string, []Row, error) {
	st := pl.st.(*selectStmt)
	tbl := pl.rt
	colNames := pl.colNames
	if st.limit == 0 {
		return colNames, nil, nil
	}
	ctx := evalCtx{cols: tbl.def.colByName, args: args}
	matches := rowMatcher(st.where, &ctx)            // full WHERE — filters dirty candidates
	passResidual := conjunctsMatcher(residual, &ctx) // conjuncts the snap doesn't guarantee
	dc, dirtySet := tbl.buildDirtyCands(matches, dirtyKey)

	eff := fetchBound(st.limit, st.offset) // collect offset+limit, drop offset at the end
	width := len(pl.projOrdinals)
	if st.starAll {
		width = len(tbl.def.def.Columns)
	}
	// A bounded walk (eff>=0) carves every kept row from ONE presized backing — one
	// allocation for the result instead of one per row; it collects exactly eff rows
	// (done() stops the merge), so the backing never regrows. An unbounded walk
	// projects per row (a regrowing shared backing would retain superseded
	// generations). Mirrors the join ordered-walk paths.
	bounded := eff >= 0
	var packed []Value
	capHint := eff
	if !bounded || capHint > 1024 {
		capHint = 1024
	}
	if bounded {
		packed = make([]Value, 0, eff*width)
	}
	out := make([]Row, 0, capHint)
	done := func() bool { return eff >= 0 && len(out) >= eff }
	emitIdx := func(pk UUID) {
		// nil pred when the snap fully satisfies the filter (no residual): the fetch
		// just projects. With a residual, appendMatchProject re-checks it under the
		// shard lock and clones only the projection in one pass.
		var pred func(Row) bool
		if len(residual) != 0 {
			pred = passResidual
		}
		if bounded {
			start := len(packed)
			var ok bool
			if packed, ok = tbl.appendMatchProject(pk, pred, pl.projOrdinals, st.starAll, packed); ok {
				out = append(out, Row(packed[start:len(packed):len(packed)]))
			}
			return
		}
		if cells, ok := tbl.appendMatchProject(pk, pred, pl.projOrdinals, st.starAll, make([]Value, 0, width)); ok {
			out = append(out, cells)
		}
	}
	emitDirty := func(row Row) { // dc rows are owned clones, already matched
		out, packed = appendProjected(out, packed, row, pl.projOrdinals, st.starAll, bounded)
	}
	mergeOrderedStreams(snap, dc, dirtySet, st.orderDesc, done, emitIdx, emitDirty)
	return colNames, sliceOffsetLimit(out, st.offset, st.limit), nil
}

// orderedWalkEach is the streaming form of orderedWalk: it drives the SAME
// snap+dirty merge in the SAME ORDER BY order (mergeOrderedStreams + the same
// residual/dirty-shadow semantics), but instead of materializing a []Row with a
// per-row clone it visits each selected row in place. Index rows are visited
// under their shard RLock (valid only for that call, per the streaming
// memory-safety contract); dirty rows are visited from their owned clone. OFFSET
// and LIMIT are applied as rows are visited — a stream cannot post-slice. visit
// returns false to stop early. Ordered walks only exist on non-partitioned
// tables (ordered indexes are barred on partitioned ones), so the live row is
// reached via the per-shard pk map, as the idxLookup stream does.
func (db *DB) orderedWalkEach(pl *plan, args []Value, snap []ordEntry, dirtyKey func(Row) indexKey, residual []expr, visit func(row Row) bool) error {
	st := pl.st.(*selectStmt)
	tbl := pl.rt
	if st.limit == 0 {
		return nil
	}
	ctx := evalCtx{cols: tbl.def.colByName, args: args}
	matches := rowMatcher(st.where, &ctx)            // full WHERE — filters dirty candidates
	passResidual := conjunctsMatcher(residual, &ctx) // conjuncts the snap doesn't guarantee
	dc, dirtySet := tbl.buildDirtyCands(matches, dirtyKey)

	var scratch Row
	if !st.starAll {
		scratch = make(Row, len(pl.projOrdinals))
	}
	skipped, n, stop := 0, 0, false
	// deliver applies OFFSET (skip the first offset qualified rows) and LIMIT to
	// one qualified row, projecting into the reused scratch (Value headers only,
	// valid for this call) and visiting it. It sets stop on early-exit or once
	// LIMIT is reached; mergeOrderedStreams' done() then halts the walk.