From 435fa7ec0f1897d3925bd5985bbeb7fd2d68f0e1 Mon Sep 17 00:00:00 2001 From: Suhani Nagpal Date: Wed, 13 May 2026 14:37:52 +0530 Subject: [PATCH] fix(lancedb): tag search span as RETRIEVER and populate input/output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The LanceDB instrumentor emits a span for table.search().to_list() / to_pyarrow() but never sets the FI canonical retriever keys. As a result, the Future AGI dashboard shows Type=unknown with empty Input/Output panels. LanceDB uses a query-builder pattern where the actual query lives on the builder instance as `_query` / `_text` / `_limit` rather than in args/kwargs. The wrapper now reads from those attributes. Changes (all in traceai_lancedb/_wrappers.py) - Optional `fi_instrumentation.fi_types` import with raw-string fallback. - In `SearchWrapper.__call__`: - Set `gen_ai.span.kind = "RETRIEVER"`. - Build an input summary {limit, output_format} extended with either `query` (capped at 500 chars, for text queries) or `vector_dim` (for list/tuple vector queries) based on what's on the builder instance (`_query`, falling back to `_text`). - Set `input.value` as the JSON summary with `input.mime_type = application/json`. - After the wrapped call, set `output.value` based on output_format: - `to_list` → JSON of the first 50 rows - PyArrow table (`num_rows` attribute) → JSON of the first 50 rows via `to_pylist()`, falling back to the uncapped column mapping from `to_pydict()` when `to_pylist()` is unavailable - Set `output.mime_type = application/json` in both cases. Add/Update/Delete/CreateTable/DropTable/OpenTable are untouched. Verified end-to-end via Future AGI MCP. `lancedb search` span now shows Type=Retriever in the dashboard with populated Input/Output. 
--- .../lancedb/traceai_lancedb/_wrappers.py | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/python/frameworks/lancedb/traceai_lancedb/_wrappers.py b/python/frameworks/lancedb/traceai_lancedb/_wrappers.py index e6f97452..0cbd760a 100644 --- a/python/frameworks/lancedb/traceai_lancedb/_wrappers.py +++ b/python/frameworks/lancedb/traceai_lancedb/_wrappers.py @@ -6,6 +6,24 @@ from opentelemetry.trace import SpanKind, Status, StatusCode, Tracer +# FI canonical span-kind / IO keys. Optional dependency. +try: + from fi_instrumentation.fi_types import FiSpanKindValues, SpanAttributes + + _FI_SPAN_KIND = SpanAttributes.FI_SPAN_KIND + _FI_INPUT_VALUE = SpanAttributes.INPUT_VALUE + _FI_INPUT_MIME_TYPE = SpanAttributes.INPUT_MIME_TYPE + _FI_OUTPUT_VALUE = SpanAttributes.OUTPUT_VALUE + _FI_OUTPUT_MIME_TYPE = SpanAttributes.OUTPUT_MIME_TYPE + _FI_RETRIEVER = FiSpanKindValues.RETRIEVER.value +except Exception: # pragma: no cover + _FI_SPAN_KIND = "gen_ai.span.kind" + _FI_INPUT_VALUE = "input.value" + _FI_INPUT_MIME_TYPE = "input.mime_type" + _FI_OUTPUT_VALUE = "output.value" + _FI_OUTPUT_MIME_TYPE = "output.mime_type" + _FI_RETRIEVER = "RETRIEVER" + logger = logging.getLogger(__name__) @@ -54,14 +72,53 @@ def __call__(self, wrapped: Callable, instance: Any, args: tuple, kwargs: dict) attributes["db.vector.query.top_k"] = limit attributes["db.vector.search.output_format"] = self._method + # FI canonical retriever attributes. + attributes[_FI_SPAN_KIND] = _FI_RETRIEVER + # LanceDB uses a query-builder pattern; the actual query data is on + # the builder instance. Surface what we can find safely. 
+ query_value = getattr(instance, "_query", None) or getattr( + instance, "_text", None + ) + input_summary: dict = {"limit": limit, "output_format": self._method} + if isinstance(query_value, str): + input_summary["query"] = query_value[:500] + elif isinstance(query_value, (list, tuple)): + input_summary["vector_dim"] = len(query_value) + attributes[_FI_INPUT_VALUE] = safe_json_dumps(input_summary) + attributes[_FI_INPUT_MIME_TYPE] = "application/json" + with self._tracer.start_as_current_span("lancedb search", kind=SpanKind.CLIENT, attributes=attributes) as span: try: result = wrapped(*args, **kwargs) if result is not None: if self._method == "to_list": span.set_attribute("db.vector.results.count", len(result)) + span.set_attribute( + _FI_OUTPUT_VALUE, safe_json_dumps(result[:50]) + ) + span.set_attribute( + _FI_OUTPUT_MIME_TYPE, "application/json" + ) elif hasattr(result, "num_rows"): span.set_attribute("db.vector.results.count", result.num_rows) + # Best-effort: convert pyarrow table to a dict list. + try: + if hasattr(result, "to_pylist"): + rows = result.to_pylist() + elif hasattr(result, "to_pydict"): + rows = result.to_pydict() + else: + rows = None + if rows is not None: + span.set_attribute( + _FI_OUTPUT_VALUE, + safe_json_dumps(rows[:50] if isinstance(rows, list) else rows), + ) + span.set_attribute( + _FI_OUTPUT_MIME_TYPE, "application/json" + ) + except Exception: + pass span.set_status(Status(StatusCode.OK)) return result except Exception as e: