From c125d641daf43dc4abeab651b0d51777078ff175 Mon Sep 17 00:00:00 2001 From: Strift Date: Sat, 8 Mar 2025 16:39:32 +0800 Subject: [PATCH 01/24] Update embedders --- meilisearch/index.py | 77 ++++++++++++++++++++++++++++++++----- meilisearch/models/index.py | 52 ++++++++++++++++++++++++- 2 files changed, 118 insertions(+), 11 deletions(-) diff --git a/meilisearch/index.py b/meilisearch/index.py index ade11797..8b0ac209 100644 --- a/meilisearch/index.py +++ b/meilisearch/index.py @@ -35,6 +35,8 @@ ProximityPrecision, TypoTolerance, UserProvidedEmbedder, + OllamaEmbedder, + RestEmbedder, ) from meilisearch.models.task import Task, TaskInfo, TaskResults from meilisearch.task import TaskHandler @@ -990,9 +992,29 @@ def update_settings(self, body: MutableMapping[str, Any]) -> TaskInfo: An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors """ if body.get("embedders"): - for _, v in body["embedders"].items(): - if "documentTemplateMaxBytes" in v and v["documentTemplateMaxBytes"] is None: - del v["documentTemplateMaxBytes"] + for embedder_name, embedder_config in body["embedders"].items(): + # Validate source field + source = embedder_config.get("source") + if source not in ["openAi", "huggingFace", "ollama", "rest", "userProvided"]: + raise ValueError( + f"Invalid source for embedder '{embedder_name}'. " + f"Must be one of: 'openAi', 'huggingFace', 'ollama', 'rest', 'userProvided'." + ) + + # Validate required fields for REST embedder + if source == "rest" and ( + "request" not in embedder_config or "response" not in embedder_config + ): + raise ValueError( + f"Embedder '{embedder_name}' with source 'rest' must include 'request' and 'response' fields." 
+ ) + + # Clean up None values for optional fields + if ( + "documentTemplateMaxBytes" in embedder_config + and embedder_config["documentTemplateMaxBytes"] is None + ): + del embedder_config["documentTemplateMaxBytes"] task = self.http.patch( f"{self.config.paths.index}/{self.uid}/{self.config.paths.setting}", body @@ -1881,13 +1903,30 @@ def get_embedders(self) -> Embedders | None: if not response: return None - embedders: dict[str, OpenAiEmbedder | HuggingFaceEmbedder | UserProvidedEmbedder] = {} + embedders: dict[ + str, + Union[ + OpenAiEmbedder, + HuggingFaceEmbedder, + OllamaEmbedder, + RestEmbedder, + UserProvidedEmbedder, + ], + ] = {} for k, v in response.items(): - if v.get("source") == "openAi": + source = v.get("source") + if source == "openAi": embedders[k] = OpenAiEmbedder(**v) - elif v.get("source") == "huggingFace": + elif source == "huggingFace": embedders[k] = HuggingFaceEmbedder(**v) + elif source == "ollama": + embedders[k] = OllamaEmbedder(**v) + elif source == "rest": + embedders[k] = RestEmbedder(**v) + elif source == "userProvided": + embedders[k] = UserProvidedEmbedder(**v) else: + # Default to UserProvidedEmbedder for unknown sources embedders[k] = UserProvidedEmbedder(**v) return Embedders(embedders=embedders) @@ -1913,9 +1952,29 @@ def update_embedders(self, body: Union[MutableMapping[str, Any], None]) -> TaskI """ if body: - for _, v in body.items(): - if "documentTemplateMaxBytes" in v and v["documentTemplateMaxBytes"] is None: - del v["documentTemplateMaxBytes"] + for embedder_name, embedder_config in body.items(): + # Validate source field + source = embedder_config.get("source") + if source not in ["openAi", "huggingFace", "ollama", "rest", "userProvided"]: + raise ValueError( + f"Invalid source for embedder '{embedder_name}'. " + f"Must be one of: 'openAi', 'huggingFace', 'ollama', 'rest', 'userProvided'." 
+ ) + + # Validate required fields for REST embedder + if source == "rest" and ( + "request" not in embedder_config or "response" not in embedder_config + ): + raise ValueError( + f"Embedder '{embedder_name}' with source 'rest' must include 'request' and 'response' fields." + ) + + # Clean up None values for optional fields + if ( + "documentTemplateMaxBytes" in embedder_config + and embedder_config["documentTemplateMaxBytes"] is None + ): + del embedder_config["documentTemplateMaxBytes"] task = self.http.patch(self.__settings_url_for(self.config.paths.embedders), body) diff --git a/meilisearch/models/index.py b/meilisearch/models/index.py index bee521ed..0589675f 100644 --- a/meilisearch/models/index.py +++ b/meilisearch/models/index.py @@ -59,27 +59,75 @@ class LocalizedAttributes(CamelBase): locales: List[str] +class Distribution(CamelBase): + mean: float + sigma: float + + class OpenAiEmbedder(CamelBase): source: str = "openAi" + url: Optional[str] = None + api_key: Optional[str] = None # Can be provided through a CLI option or environment variable model: Optional[str] = None # Defaults to text-embedding-3-small dimensions: Optional[int] = None # Uses the model default - api_key: Optional[str] = None # Can be provided through a CLI option or environment variable document_template: Optional[str] = None document_template_max_bytes: Optional[int] = None # Default to 400 + distribution: Optional[Distribution] = None + binary_quantized: Optional[bool] = None class HuggingFaceEmbedder(CamelBase): source: str = "huggingFace" + url: Optional[str] = None + api_key: Optional[str] = None model: Optional[str] = None # Defaults to BAAI/bge-base-en-v1.5 + dimensions: Optional[int] = None revision: Optional[str] = None document_template: Optional[str] = None document_template_max_bytes: Optional[int] = None # Default to 400 + distribution: Optional[Distribution] = None + binary_quantized: Optional[bool] = None + + +class OllamaEmbedder(CamelBase): + source: str = "ollama" + 
url: Optional[str] = None + api_key: Optional[str] = None + model: Optional[str] = None + dimensions: Optional[int] = None + document_template: Optional[str] = None + document_template_max_bytes: Optional[int] = None + distribution: Optional[Distribution] = None + binary_quantized: Optional[bool] = None + + +class RestEmbedder(CamelBase): + source: str = "rest" + url: Optional[str] = None + api_key: Optional[str] = None + dimensions: Optional[int] = None + document_template: Optional[str] = None + document_template_max_bytes: Optional[int] = None + request: Dict[str, Any] + response: Dict[str, Any] + headers: Optional[Dict[str, str]] = None + distribution: Optional[Distribution] = None + binary_quantized: Optional[bool] = None class UserProvidedEmbedder(CamelBase): source: str = "userProvided" dimensions: int + document_template: Optional[str] = None + document_template_max_bytes: Optional[int] = None + distribution: Optional[Distribution] = None + binary_quantized: Optional[bool] = None class Embedders(CamelBase): - embedders: Dict[str, Union[OpenAiEmbedder, HuggingFaceEmbedder, UserProvidedEmbedder]] + embedders: Dict[ + str, + Union[ + OpenAiEmbedder, HuggingFaceEmbedder, OllamaEmbedder, RestEmbedder, UserProvidedEmbedder + ], + ] From 7f61d2cee69ee53d1b282fd68e948f2d2f60eb1e Mon Sep 17 00:00:00 2001 From: Strift Date: Sat, 8 Mar 2025 16:50:12 +0800 Subject: [PATCH 02/24] Update embedders models --- meilisearch/models/embedders.py | 211 ++++++++++++++++++++++++++++++++ meilisearch/models/index.py | 84 ++----------- 2 files changed, 221 insertions(+), 74 deletions(-) create mode 100644 meilisearch/models/embedders.py diff --git a/meilisearch/models/embedders.py b/meilisearch/models/embedders.py new file mode 100644 index 00000000..0f1b95db --- /dev/null +++ b/meilisearch/models/embedders.py @@ -0,0 +1,211 @@ +from __future__ import annotations + +from typing import Any, Dict, List, Optional, Union + +from camel_converter.pydantic_base import CamelBase + + +class 
Distribution(CamelBase): + """Distribution settings for embedders. + + Parameters + ---------- + mean: float + Mean value between 0 and 1 + sigma: float + Sigma value between 0 and 1 + """ + + mean: float + sigma: float + + +class OpenAiEmbedder(CamelBase): + """OpenAI embedder configuration. + + Parameters + ---------- + source: str + The embedder source, must be "openAi" + url: Optional[str] + The URL Meilisearch contacts when querying the embedder + api_key: Optional[str] + Authentication token Meilisearch should send with each request to the embedder + model: Optional[str] + The model your embedder uses when generating vectors (defaults to text-embedding-3-small) + dimensions: Optional[int] + Number of dimensions in the chosen model + document_template: Optional[str] + Template defining the data Meilisearch sends to the embedder + document_template_max_bytes: Optional[int] + Maximum allowed size of rendered document template (defaults to 400) + distribution: Optional[Distribution] + Describes the natural distribution of search results + binary_quantized: Optional[bool] + Once set to true, irreversibly converts all vector dimensions to 1-bit values + """ + + source: str = "openAi" + url: Optional[str] = None + api_key: Optional[str] = None + model: Optional[str] = None # Defaults to text-embedding-3-small + dimensions: Optional[int] = None # Uses the model default + document_template: Optional[str] = None + document_template_max_bytes: Optional[int] = None # Default to 400 + distribution: Optional[Distribution] = None + binary_quantized: Optional[bool] = None + + +class HuggingFaceEmbedder(CamelBase): + """HuggingFace embedder configuration. 
+ + Parameters + ---------- + source: str + The embedder source, must be "huggingFace" + url: Optional[str] + The URL Meilisearch contacts when querying the embedder + api_key: Optional[str] + Authentication token Meilisearch should send with each request to the embedder + model: Optional[str] + The model your embedder uses when generating vectors (defaults to BAAI/bge-base-en-v1.5) + dimensions: Optional[int] + Number of dimensions in the chosen model + revision: Optional[str] + Model revision hash + document_template: Optional[str] + Template defining the data Meilisearch sends to the embedder + document_template_max_bytes: Optional[int] + Maximum allowed size of rendered document template (defaults to 400) + distribution: Optional[Distribution] + Describes the natural distribution of search results + binary_quantized: Optional[bool] + Once set to true, irreversibly converts all vector dimensions to 1-bit values + """ + + source: str = "huggingFace" + url: Optional[str] = None + api_key: Optional[str] = None + model: Optional[str] = None # Defaults to BAAI/bge-base-en-v1.5 + dimensions: Optional[int] = None + revision: Optional[str] = None + document_template: Optional[str] = None + document_template_max_bytes: Optional[int] = None # Default to 400 + distribution: Optional[Distribution] = None + binary_quantized: Optional[bool] = None + + +class OllamaEmbedder(CamelBase): + """Ollama embedder configuration. 
+ + Parameters + ---------- + source: str + The embedder source, must be "ollama" + url: Optional[str] + The URL Meilisearch contacts when querying the embedder (defaults to http://localhost:11434/api/embeddings) + api_key: Optional[str] + Authentication token Meilisearch should send with each request to the embedder + model: Optional[str] + The model your embedder uses when generating vectors + dimensions: Optional[int] + Number of dimensions in the chosen model + document_template: Optional[str] + Template defining the data Meilisearch sends to the embedder + document_template_max_bytes: Optional[int] + Maximum allowed size of rendered document template (defaults to 400) + distribution: Optional[Distribution] + Describes the natural distribution of search results + binary_quantized: Optional[bool] + Once set to true, irreversibly converts all vector dimensions to 1-bit values + """ + + source: str = "ollama" + url: Optional[str] = None + api_key: Optional[str] = None + model: Optional[str] = None + dimensions: Optional[int] = None + document_template: Optional[str] = None + document_template_max_bytes: Optional[int] = None + distribution: Optional[Distribution] = None + binary_quantized: Optional[bool] = None + + +class RestEmbedder(CamelBase): + """REST API embedder configuration. 
+ + Parameters + ---------- + source: str + The embedder source, must be "rest" + url: Optional[str] + The URL Meilisearch contacts when querying the embedder + api_key: Optional[str] + Authentication token Meilisearch should send with each request to the embedder + dimensions: Optional[int] + Number of dimensions in the chosen model + document_template: Optional[str] + Template defining the data Meilisearch sends to the embedder + document_template_max_bytes: Optional[int] + Maximum allowed size of rendered document template (defaults to 400) + request: Dict[str, Any] + A JSON value representing the request Meilisearch makes to the remote embedder + response: Dict[str, Any] + A JSON value representing the response Meilisearch expects from the remote embedder + headers: Optional[Dict[str, str]] + Custom headers to send with the request + distribution: Optional[Distribution] + Describes the natural distribution of search results + binary_quantized: Optional[bool] + Once set to true, irreversibly converts all vector dimensions to 1-bit values + """ + + source: str = "rest" + url: Optional[str] = None + api_key: Optional[str] = None + dimensions: Optional[int] = None + document_template: Optional[str] = None + document_template_max_bytes: Optional[int] = None + request: Dict[str, Any] + response: Dict[str, Any] + headers: Optional[Dict[str, str]] = None + distribution: Optional[Distribution] = None + binary_quantized: Optional[bool] = None + + +class UserProvidedEmbedder(CamelBase): + """User-provided embedder configuration.
+ + Parameters + ---------- + source: str + The embedder source, must be "userProvided" + dimensions: int + Number of dimensions in the embeddings + distribution: Optional[Distribution] + Describes the natural distribution of search results + binary_quantized: Optional[bool] + Once set to true, irreversibly converts all vector dimensions to 1-bit values + """ + + source: str = "userProvided" + dimensions: int + distribution: Optional[Distribution] = None + binary_quantized: Optional[bool] = None + + +class Embedders(CamelBase): + """Container for embedder configurations. + + Parameters + ---------- + embedders: Dict[str, Union[OpenAiEmbedder, HuggingFaceEmbedder, OllamaEmbedder, RestEmbedder, UserProvidedEmbedder]] + Dictionary of embedder configurations, where keys are embedder names + """ + + embedders: Dict[ + str, + Union[ + OpenAiEmbedder, HuggingFaceEmbedder, OllamaEmbedder, RestEmbedder, UserProvidedEmbedder + ], + ] diff --git a/meilisearch/models/index.py b/meilisearch/models/index.py index 0589675f..74ddd26b 100644 --- a/meilisearch/models/index.py +++ b/meilisearch/models/index.py @@ -6,6 +6,16 @@ from camel_converter import to_snake from camel_converter.pydantic_base import CamelBase +from meilisearch.models.embedders import ( + Distribution, + Embedders, + HuggingFaceEmbedder, + OllamaEmbedder, + OpenAiEmbedder, + RestEmbedder, + UserProvidedEmbedder, +) + class IndexStats: __dict: Dict @@ -57,77 +67,3 @@ class ProximityPrecision(str, Enum): class LocalizedAttributes(CamelBase): attribute_patterns: List[str] locales: List[str] - - -class Distribution(CamelBase): - mean: float - sigma: float - - -class OpenAiEmbedder(CamelBase): - source: str = "openAi" - url: Optional[str] = None - api_key: Optional[str] = None # Can be provided through a CLI option or environment variable - model: Optional[str] = None # Defaults to text-embedding-3-small - dimensions: Optional[int] = None # Uses the model default - document_template: Optional[str] = None - 
document_template_max_bytes: Optional[int] = None # Default to 400 - distribution: Optional[Distribution] = None - binary_quantized: Optional[bool] = None - - -class HuggingFaceEmbedder(CamelBase): - source: str = "huggingFace" - url: Optional[str] = None - api_key: Optional[str] = None - model: Optional[str] = None # Defaults to BAAI/bge-base-en-v1.5 - dimensions: Optional[int] = None - revision: Optional[str] = None - document_template: Optional[str] = None - document_template_max_bytes: Optional[int] = None # Default to 400 - distribution: Optional[Distribution] = None - binary_quantized: Optional[bool] = None - - -class OllamaEmbedder(CamelBase): - source: str = "ollama" - url: Optional[str] = None - api_key: Optional[str] = None - model: Optional[str] = None - dimensions: Optional[int] = None - document_template: Optional[str] = None - document_template_max_bytes: Optional[int] = None - distribution: Optional[Distribution] = None - binary_quantized: Optional[bool] = None - - -class RestEmbedder(CamelBase): - source: str = "rest" - url: Optional[str] = None - api_key: Optional[str] = None - dimensions: Optional[int] = None - document_template: Optional[str] = None - document_template_max_bytes: Optional[int] = None - request: Dict[str, Any] - response: Dict[str, Any] - headers: Optional[Dict[str, str]] = None - distribution: Optional[Distribution] = None - binary_quantized: Optional[bool] = None - - -class UserProvidedEmbedder(CamelBase): - source: str = "userProvided" - dimensions: int - document_template: Optional[str] = None - document_template_max_bytes: Optional[int] = None - distribution: Optional[Distribution] = None - binary_quantized: Optional[bool] = None - - -class Embedders(CamelBase): - embedders: Dict[ - str, - Union[ - OpenAiEmbedder, HuggingFaceEmbedder, OllamaEmbedder, RestEmbedder, UserProvidedEmbedder - ], - ] From 93e8f69e0965eaae8990d3159612851781ff12f2 Mon Sep 17 00:00:00 2001 From: Strift Date: Sat, 8 Mar 2025 16:56:38 +0800 Subject: 
[PATCH 03/24] Add docs --- meilisearch/index.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/meilisearch/index.py b/meilisearch/index.py index 8b0ac209..bd17a19b 100644 --- a/meilisearch/index.py +++ b/meilisearch/index.py @@ -1888,10 +1888,13 @@ def reset_non_separator_tokens(self) -> TaskInfo: def get_embedders(self) -> Embedders | None: """Get embedders of the index. + Retrieves the current embedder configuration from Meilisearch. + Returns ------- - settings: - The embedders settings of the index. + Embedders: + The embedders settings of the index, or None if no embedders are configured. + Contains a dictionary of embedder configurations, where keys are embedder names. Raises ------ @@ -1934,11 +1937,14 @@ def get_embedders(self) -> Embedders | None: def update_embedders(self, body: Union[MutableMapping[str, Any], None]) -> TaskInfo: """Update embedders of the index. + Updates the embedder configuration for the index. The embedder configuration + determines how Meilisearch generates vector embeddings for documents. + Parameters ---------- body: dict - Dictionary containing the embedders. - + Dictionary containing the embedders configuration. Each key represents an embedder name, + and the value is a dictionary with the embedder configuration. Returns ------- task_info: @@ -1969,6 +1975,18 @@ def update_embedders(self, body: Union[MutableMapping[str, Any], None]) -> TaskI f"Embedder '{embedder_name}' with source 'rest' must include 'request' and 'response' fields." ) + # Validate required fields for UserProvided embedder + if source == "userProvided" and "dimensions" not in embedder_config: + raise ValueError( + f"Embedder '{embedder_name}' with source 'userProvided' must include 'dimensions' field." 
+ ) + + # Validate that documentTemplate is not used with userProvided + if source == "userProvided" and "documentTemplate" in embedder_config: + raise ValueError( + f"Embedder '{embedder_name}' with source 'userProvided' cannot include 'documentTemplate' field." + ) + # Clean up None values for optional fields if ( "documentTemplateMaxBytes" in embedder_config @@ -1983,6 +2001,8 @@ def update_embedders(self, body: Union[MutableMapping[str, Any], None]) -> TaskI def reset_embedders(self) -> TaskInfo: """Reset embedders of the index to default values. + Removes all embedder configurations from the index. + Returns ------- task_info: From d3aa65b4d82e1a3296278aba53978b667a53a808 Mon Sep 17 00:00:00 2001 From: Strift Date: Sat, 8 Mar 2025 17:04:55 +0800 Subject: [PATCH 04/24] Allow updating embedders via update_settings --- meilisearch/index.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/meilisearch/index.py b/meilisearch/index.py index bd17a19b..158962b7 100644 --- a/meilisearch/index.py +++ b/meilisearch/index.py @@ -977,6 +977,26 @@ def update_settings(self, body: MutableMapping[str, Any]) -> TaskInfo: ---------- body: Dictionary containing the settings of the index. 
+ Supported settings include: + - 'rankingRules': List of ranking rules + - 'distinctAttribute': Attribute for deduplication + - 'searchableAttributes': Attributes that can be searched + - 'displayedAttributes': Attributes to display in search results + - 'stopWords': Words ignored in search queries + - 'synonyms': Dictionary of synonyms + - 'filterableAttributes': Attributes that can be used for filtering + - 'sortableAttributes': Attributes that can be used for sorting + - 'typoTolerance': Settings for typo tolerance + - 'pagination': Settings for pagination + - 'faceting': Settings for faceting + - 'dictionary': List of custom dictionary words + - 'separatorTokens': List of separator tokens + - 'nonSeparatorTokens': List of non-separator tokens + - 'embedders': Dictionary of embedder configurations for AI-powered search + - 'searchCutoffMs': Maximum search time in milliseconds + - 'proximityPrecision': Precision for proximity ranking + - 'localizedAttributes': Settings for localized attributes + More information: https://www.meilisearch.com/docs/reference/api/settings#update-settings @@ -1009,6 +1029,18 @@ def update_settings(self, body: MutableMapping[str, Any]) -> TaskInfo: f"Embedder '{embedder_name}' with source 'rest' must include 'request' and 'response' fields." ) + # Validate required fields for UserProvided embedder + if source == "userProvided" and "dimensions" not in embedder_config: + raise ValueError( + f"Embedder '{embedder_name}' with source 'userProvided' must include 'dimensions' field." + ) + + # Validate that documentTemplate is not used with userProvided + if source == "userProvided" and "documentTemplate" in embedder_config: + raise ValueError( + f"Embedder '{embedder_name}' with source 'userProvided' cannot include 'documentTemplate' field." 
+ ) + # Clean up None values for optional fields if ( "documentTemplateMaxBytes" in embedder_config From 2fcbc4709acadd9eddcdc21f48dc9accdf6c0d24 Mon Sep 17 00:00:00 2001 From: Strift Date: Sat, 8 Mar 2025 17:13:05 +0800 Subject: [PATCH 05/24] Refactor config validation to avoid duplicate code --- meilisearch/index.py | 134 ++++++++++++++-------------- meilisearch/models/embedders.py | 152 ++++++++++++++++++++++++++++++++ 2 files changed, 218 insertions(+), 68 deletions(-) diff --git a/meilisearch/index.py b/meilisearch/index.py index 158962b7..f9b4b5e5 100644 --- a/meilisearch/index.py +++ b/meilisearch/index.py @@ -968,6 +968,58 @@ def get_settings(self) -> Dict[str, Any]: return settings + def __validate_embedder_config( + self, embedder_name: str, embedder_config: MutableMapping[str, Any] + ) -> None: + """Validate an embedder configuration. + + Parameters + ---------- + embedder_name: str + The name of the embedder + embedder_config: dict + The embedder configuration to validate + + Raises + ------ + ValueError + If the embedder configuration is invalid + """ + # Validate source field + source = embedder_config.get("source") + if source not in ["openAi", "huggingFace", "ollama", "rest", "userProvided"]: + raise ValueError( + f"Invalid source for embedder '{embedder_name}'. " + f"Must be one of: 'openAi', 'huggingFace', 'ollama', 'rest', 'userProvided'." + ) + + # Validate required fields for REST embedder + if source == "rest" and ( + "request" not in embedder_config or "response" not in embedder_config + ): + raise ValueError( + f"Embedder '{embedder_name}' with source 'rest' must include 'request' and 'response' fields." + ) + + # Validate required fields for UserProvided embedder + if source == "userProvided" and "dimensions" not in embedder_config: + raise ValueError( + f"Embedder '{embedder_name}' with source 'userProvided' must include 'dimensions' field." 
+ ) + + # Validate that documentTemplate is not used with userProvided + if source == "userProvided" and "documentTemplate" in embedder_config: + raise ValueError( + f"Embedder '{embedder_name}' with source 'userProvided' cannot include 'documentTemplate' field." + ) + + # Clean up None values for optional fields + if ( + "documentTemplateMaxBytes" in embedder_config + and embedder_config["documentTemplateMaxBytes"] is None + ): + del embedder_config["documentTemplateMaxBytes"] + def update_settings(self, body: MutableMapping[str, Any]) -> TaskInfo: """Update settings of the index. @@ -1008,45 +1060,14 @@ def update_settings(self, body: MutableMapping[str, Any]) -> TaskInfo: Raises ------ + ValueError + If the provided embedder configuration is invalid. MeilisearchApiError An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors """ if body.get("embedders"): for embedder_name, embedder_config in body["embedders"].items(): - # Validate source field - source = embedder_config.get("source") - if source not in ["openAi", "huggingFace", "ollama", "rest", "userProvided"]: - raise ValueError( - f"Invalid source for embedder '{embedder_name}'. " - f"Must be one of: 'openAi', 'huggingFace', 'ollama', 'rest', 'userProvided'." - ) - - # Validate required fields for REST embedder - if source == "rest" and ( - "request" not in embedder_config or "response" not in embedder_config - ): - raise ValueError( - f"Embedder '{embedder_name}' with source 'rest' must include 'request' and 'response' fields." - ) - - # Validate required fields for UserProvided embedder - if source == "userProvided" and "dimensions" not in embedder_config: - raise ValueError( - f"Embedder '{embedder_name}' with source 'userProvided' must include 'dimensions' field." 
- ) - - # Validate that documentTemplate is not used with userProvided - if source == "userProvided" and "documentTemplate" in embedder_config: - raise ValueError( - f"Embedder '{embedder_name}' with source 'userProvided' cannot include 'documentTemplate' field." - ) - - # Clean up None values for optional fields - if ( - "documentTemplateMaxBytes" in embedder_config - and embedder_config["documentTemplateMaxBytes"] is None - ): - del embedder_config["documentTemplateMaxBytes"] + self.__validate_embedder_config(embedder_name, embedder_config) task = self.http.patch( f"{self.config.paths.index}/{self.uid}/{self.config.paths.setting}", body @@ -1977,6 +1998,14 @@ def update_embedders(self, body: Union[MutableMapping[str, Any], None]) -> TaskI body: dict Dictionary containing the embedders configuration. Each key represents an embedder name, and the value is a dictionary with the embedder configuration. + + Supported embedder sources: + - 'openAi': OpenAI embedder + - 'huggingFace': HuggingFace embedder + - 'ollama': Ollama embedder + - 'rest': REST API embedder + - 'userProvided': User-provided embedder + Returns ------- task_info: @@ -1985,46 +2014,15 @@ def update_embedders(self, body: Union[MutableMapping[str, Any], None]) -> TaskI Raises ------ + ValueError + If the provided embedder configuration is invalid. MeilisearchApiError An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors """ if body: for embedder_name, embedder_config in body.items(): - # Validate source field - source = embedder_config.get("source") - if source not in ["openAi", "huggingFace", "ollama", "rest", "userProvided"]: - raise ValueError( - f"Invalid source for embedder '{embedder_name}'. " - f"Must be one of: 'openAi', 'huggingFace', 'ollama', 'rest', 'userProvided'." 
- ) - - # Validate required fields for REST embedder - if source == "rest" and ( - "request" not in embedder_config or "response" not in embedder_config - ): - raise ValueError( - f"Embedder '{embedder_name}' with source 'rest' must include 'request' and 'response' fields." - ) - - # Validate required fields for UserProvided embedder - if source == "userProvided" and "dimensions" not in embedder_config: - raise ValueError( - f"Embedder '{embedder_name}' with source 'userProvided' must include 'dimensions' field." - ) - - # Validate that documentTemplate is not used with userProvided - if source == "userProvided" and "documentTemplate" in embedder_config: - raise ValueError( - f"Embedder '{embedder_name}' with source 'userProvided' cannot include 'documentTemplate' field." - ) - - # Clean up None values for optional fields - if ( - "documentTemplateMaxBytes" in embedder_config - and embedder_config["documentTemplateMaxBytes"] is None - ): - del embedder_config["documentTemplateMaxBytes"] + self.__validate_embedder_config(embedder_name, embedder_config) task = self.http.patch(self.__settings_url_for(self.config.paths.embedders), body) diff --git a/meilisearch/models/embedders.py b/meilisearch/models/embedders.py index 0f1b95db..eb4a1dc1 100644 --- a/meilisearch/models/embedders.py +++ b/meilisearch/models/embedders.py @@ -55,6 +55,25 @@ class OpenAiEmbedder(CamelBase): distribution: Optional[Distribution] = None binary_quantized: Optional[bool] = None + @classmethod + def validate_config(cls, name: str, config: Dict[str, Any]) -> None: + """Validate the configuration for an OpenAI embedder. + + Parameters + ---------- + name: str + The name of the embedder + config: Dict[str, Any] + The configuration to validate + + Raises + ------ + ValueError + If the configuration is invalid + """ + if config.get("source") != "openAi": + raise ValueError(f"Embedder '{name}' must have source 'openAi'") + class HuggingFaceEmbedder(CamelBase): """HuggingFace embedder configuration. 
@@ -94,6 +113,25 @@ class HuggingFaceEmbedder(CamelBase): distribution: Optional[Distribution] = None binary_quantized: Optional[bool] = None + @classmethod + def validate_config(cls, name: str, config: Dict[str, Any]) -> None: + """Validate the configuration for a HuggingFace embedder. + + Parameters + ---------- + name: str + The name of the embedder + config: Dict[str, Any] + The configuration to validate + + Raises + ------ + ValueError + If the configuration is invalid + """ + if config.get("source") != "huggingFace": + raise ValueError(f"Embedder '{name}' must have source 'huggingFace'") + class OllamaEmbedder(CamelBase): """Ollama embedder configuration. @@ -130,6 +168,25 @@ class OllamaEmbedder(CamelBase): distribution: Optional[Distribution] = None binary_quantized: Optional[bool] = None + @classmethod + def validate_config(cls, name: str, config: Dict[str, Any]) -> None: + """Validate the configuration for an Ollama embedder. + + Parameters + ---------- + name: str + The name of the embedder + config: Dict[str, Any] + The configuration to validate + + Raises + ------ + ValueError + If the configuration is invalid + """ + if config.get("source") != "ollama": + raise ValueError(f"Embedder '{name}' must have source 'ollama'") + class RestEmbedder(CamelBase): """REST API embedder configuration. @@ -172,6 +229,31 @@ class RestEmbedder(CamelBase): distribution: Optional[Distribution] = None binary_quantized: Optional[bool] = None + @classmethod + def validate_config(cls, name: str, config: Dict[str, Any]) -> None: + """Validate the configuration for a REST embedder. 
+ + Parameters + ---------- + name: str + The name of the embedder + config: Dict[str, Any] + The configuration to validate + + Raises + ------ + ValueError + If the configuration is invalid + """ + if config.get("source") != "rest": + raise ValueError(f"Embedder '{name}' must have source 'rest'") + + if "request" not in config: + raise ValueError(f"Embedder '{name}' with source 'rest' must include 'request' field") + + if "response" not in config: + raise ValueError(f"Embedder '{name}' with source 'rest' must include 'response' field") + class UserProvidedEmbedder(CamelBase): """User-provided embedder configuration. @@ -193,6 +275,35 @@ class UserProvidedEmbedder(CamelBase): distribution: Optional[Distribution] = None binary_quantized: Optional[bool] = None + @classmethod + def validate_config(cls, name: str, config: Dict[str, Any]) -> None: + """Validate the configuration for a user-provided embedder. + + Parameters + ---------- + name: str + The name of the embedder + config: Dict[str, Any] + The configuration to validate + + Raises + ------ + ValueError + If the configuration is invalid + """ + if config.get("source") != "userProvided": + raise ValueError(f"Embedder '{name}' must have source 'userProvided'") + + if "dimensions" not in config: + raise ValueError( + f"Embedder '{name}' with source 'userProvided' must include 'dimensions' field" + ) + + if "documentTemplate" in config: + raise ValueError( + f"Embedder '{name}' with source 'userProvided' cannot include 'documentTemplate' field" + ) + class Embedders(CamelBase): """Container for embedder configurations. @@ -209,3 +320,44 @@ class Embedders(CamelBase): OpenAiEmbedder, HuggingFaceEmbedder, OllamaEmbedder, RestEmbedder, UserProvidedEmbedder ], ] + + @classmethod + def validate_config(cls, config: Dict[str, Dict[str, Any]]) -> None: + """Validate the configuration for embedders. 
+ + Parameters + ---------- + config: Dict[str, Dict[str, Any]] + The configuration to validate, where keys are embedder names and values are embedder configurations + + Raises + ------ + ValueError + If the configuration is invalid + """ + for name, embedder_config in config.items(): + source = embedder_config.get("source") + if source not in ["openAi", "huggingFace", "ollama", "rest", "userProvided"]: + raise ValueError( + f"Invalid source for embedder '{name}'. " + f"Must be one of: 'openAi', 'huggingFace', 'ollama', 'rest', 'userProvided'." + ) + + # Clean up None values for optional fields + if ( + "documentTemplateMaxBytes" in embedder_config + and embedder_config["documentTemplateMaxBytes"] is None + ): + del embedder_config["documentTemplateMaxBytes"] + + # Validate based on source + if source == "openAi": + OpenAiEmbedder.validate_config(name, embedder_config) + elif source == "huggingFace": + HuggingFaceEmbedder.validate_config(name, embedder_config) + elif source == "ollama": + OllamaEmbedder.validate_config(name, embedder_config) + elif source == "rest": + RestEmbedder.validate_config(name, embedder_config) + elif source == "userProvided": + UserProvidedEmbedder.validate_config(name, embedder_config) From f9258e9e5dfd88385d448bc74a95983ea87e3da3 Mon Sep 17 00:00:00 2001 From: Strift Date: Sat, 8 Mar 2025 17:27:29 +0800 Subject: [PATCH 06/24] Update validation code --- meilisearch/index.py | 105 +++++-------- meilisearch/models/embedders.py | 252 +++++++++++++------------------- 2 files changed, 138 insertions(+), 219 deletions(-) diff --git a/meilisearch/index.py b/meilisearch/index.py index f9b4b5e5..4ac116fb 100644 --- a/meilisearch/index.py +++ b/meilisearch/index.py @@ -25,18 +25,22 @@ from meilisearch.errors import version_error_hint_message from meilisearch.models.document import Document, DocumentsResults from meilisearch.models.index import ( - Embedders, Faceting, - HuggingFaceEmbedder, IndexStats, LocalizedAttributes, - OpenAiEmbedder, 
Pagination, ProximityPrecision, TypoTolerance, - UserProvidedEmbedder, +) +from meilisearch.models.embedders import ( + Distribution, + Embedders, + HuggingFaceEmbedder, OllamaEmbedder, + OpenAiEmbedder, RestEmbedder, + UserProvidedEmbedder, + validate_embedders, ) from meilisearch.models.task import Task, TaskInfo, TaskResults from meilisearch.task import TaskHandler @@ -968,58 +972,6 @@ def get_settings(self) -> Dict[str, Any]: return settings - def __validate_embedder_config( - self, embedder_name: str, embedder_config: MutableMapping[str, Any] - ) -> None: - """Validate an embedder configuration. - - Parameters - ---------- - embedder_name: str - The name of the embedder - embedder_config: dict - The embedder configuration to validate - - Raises - ------ - ValueError - If the embedder configuration is invalid - """ - # Validate source field - source = embedder_config.get("source") - if source not in ["openAi", "huggingFace", "ollama", "rest", "userProvided"]: - raise ValueError( - f"Invalid source for embedder '{embedder_name}'. " - f"Must be one of: 'openAi', 'huggingFace', 'ollama', 'rest', 'userProvided'." - ) - - # Validate required fields for REST embedder - if source == "rest" and ( - "request" not in embedder_config or "response" not in embedder_config - ): - raise ValueError( - f"Embedder '{embedder_name}' with source 'rest' must include 'request' and 'response' fields." - ) - - # Validate required fields for UserProvided embedder - if source == "userProvided" and "dimensions" not in embedder_config: - raise ValueError( - f"Embedder '{embedder_name}' with source 'userProvided' must include 'dimensions' field." - ) - - # Validate that documentTemplate is not used with userProvided - if source == "userProvided" and "documentTemplate" in embedder_config: - raise ValueError( - f"Embedder '{embedder_name}' with source 'userProvided' cannot include 'documentTemplate' field." 
- ) - - # Clean up None values for optional fields - if ( - "documentTemplateMaxBytes" in embedder_config - and embedder_config["documentTemplateMaxBytes"] is None - ): - del embedder_config["documentTemplateMaxBytes"] - def update_settings(self, body: MutableMapping[str, Any]) -> TaskInfo: """Update settings of the index. @@ -1063,14 +1015,18 @@ def update_settings(self, body: MutableMapping[str, Any]) -> TaskInfo: ValueError If the provided embedder configuration is invalid. MeilisearchApiError - An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors + An error containing details about why Meilisearch can't process your request. + Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors """ - if body.get("embedders"): - for embedder_name, embedder_config in body["embedders"].items(): - self.__validate_embedder_config(embedder_name, embedder_config) + # Create a copy of the body to avoid modifying the original + body_copy = body.copy() + + # Validate embedders if present + if "embedders" in body_copy: + body_copy["embedders"] = validate_embedders(body_copy["embedders"]) task = self.http.patch( - f"{self.config.paths.index}/{self.uid}/{self.config.paths.setting}", body + f"{self.config.paths.index}/{self.uid}/{self.config.paths.setting}", body_copy ) return TaskInfo(**task) @@ -2006,6 +1962,22 @@ def update_embedders(self, body: Union[MutableMapping[str, Any], None]) -> TaskI - 'rest': REST API embedder - 'userProvided': User-provided embedder + Required fields depend on the embedder source: + - 'rest' requires 'request' and 'response' fields + - 'userProvided' requires 'dimensions' field + + Optional fields (availability depends on source): + - 'url': The URL Meilisearch contacts when querying the embedder + - 'apiKey': Authentication token for the 
embedder + - 'model': The model used for generating vectors + - 'documentTemplate': Template defining the data sent to the embedder + - 'documentTemplateMaxBytes': Maximum size of rendered document template + - 'dimensions': Number of dimensions in the chosen model + - 'revision': Model revision hash (only for 'huggingFace') + - 'distribution': Object with 'mean' and 'sigma' fields + - 'binaryQuantized': Boolean to convert vector dimensions to 1-bit values + - 'headers': Custom headers for requests (only for 'rest') + Returns ------- task_info: @@ -2017,14 +1989,13 @@ def update_embedders(self, body: Union[MutableMapping[str, Any], None]) -> TaskI ValueError If the provided embedder configuration is invalid. MeilisearchApiError - An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors + An error containing details about why Meilisearch can't process your request. 
+ Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors """ + # Validate embedders + validated_body = validate_embedders(body) if body else None - if body: - for embedder_name, embedder_config in body.items(): - self.__validate_embedder_config(embedder_name, embedder_config) - - task = self.http.patch(self.__settings_url_for(self.config.paths.embedders), body) + task = self.http.patch(self.__settings_url_for(self.config.paths.embedders), validated_body) return TaskInfo(**task) diff --git a/meilisearch/models/embedders.py b/meilisearch/models/embedders.py index eb4a1dc1..f0dcc0f6 100644 --- a/meilisearch/models/embedders.py +++ b/meilisearch/models/embedders.py @@ -1,10 +1,13 @@ from __future__ import annotations -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, Mapping, MutableMapping, Type, TypeVar from camel_converter.pydantic_base import CamelBase +T = TypeVar("T", bound="CamelBase") + + class Distribution(CamelBase): """Distribution settings for embedders. @@ -55,25 +58,6 @@ class OpenAiEmbedder(CamelBase): distribution: Optional[Distribution] = None binary_quantized: Optional[bool] = None - @classmethod - def validate_config(cls, name: str, config: Dict[str, Any]) -> None: - """Validate the configuration for an OpenAI embedder. - - Parameters - ---------- - name: str - The name of the embedder - config: Dict[str, Any] - The configuration to validate - - Raises - ------ - ValueError - If the configuration is invalid - """ - if config.get("source") != "openAi": - raise ValueError(f"Embedder '{name}' must have source 'openAi'") - class HuggingFaceEmbedder(CamelBase): """HuggingFace embedder configuration. 
@@ -113,25 +97,6 @@ class HuggingFaceEmbedder(CamelBase): distribution: Optional[Distribution] = None binary_quantized: Optional[bool] = None - @classmethod - def validate_config(cls, name: str, config: Dict[str, Any]) -> None: - """Validate the configuration for a HuggingFace embedder. - - Parameters - ---------- - name: str - The name of the embedder - config: Dict[str, Any] - The configuration to validate - - Raises - ------ - ValueError - If the configuration is invalid - """ - if config.get("source") != "huggingFace": - raise ValueError(f"Embedder '{name}' must have source 'huggingFace'") - class OllamaEmbedder(CamelBase): """Ollama embedder configuration. @@ -168,25 +133,6 @@ class OllamaEmbedder(CamelBase): distribution: Optional[Distribution] = None binary_quantized: Optional[bool] = None - @classmethod - def validate_config(cls, name: str, config: Dict[str, Any]) -> None: - """Validate the configuration for an Ollama embedder. - - Parameters - ---------- - name: str - The name of the embedder - config: Dict[str, Any] - The configuration to validate - - Raises - ------ - ValueError - If the configuration is invalid - """ - if config.get("source") != "ollama": - raise ValueError(f"Embedder '{name}' must have source 'ollama'") - class RestEmbedder(CamelBase): """REST API embedder configuration. @@ -229,31 +175,6 @@ class RestEmbedder(CamelBase): distribution: Optional[Distribution] = None binary_quantized: Optional[bool] = None - @classmethod - def validate_config(cls, name: str, config: Dict[str, Any]) -> None: - """Validate the configuration for a REST embedder. 
- - Parameters - ---------- - name: str - The name of the embedder - config: Dict[str, Any] - The configuration to validate - - Raises - ------ - ValueError - If the configuration is invalid - """ - if config.get("source") != "rest": - raise ValueError(f"Embedder '{name}' must have source 'rest'") - - if "request" not in config: - raise ValueError(f"Embedder '{name}' with source 'rest' must include 'request' field") - - if "response" not in config: - raise ValueError(f"Embedder '{name}' with source 'rest' must include 'response' field") - class UserProvidedEmbedder(CamelBase): """User-provided embedder configuration. @@ -275,35 +196,6 @@ class UserProvidedEmbedder(CamelBase): distribution: Optional[Distribution] = None binary_quantized: Optional[bool] = None - @classmethod - def validate_config(cls, name: str, config: Dict[str, Any]) -> None: - """Validate the configuration for a user-provided embedder. - - Parameters - ---------- - name: str - The name of the embedder - config: Dict[str, Any] - The configuration to validate - - Raises - ------ - ValueError - If the configuration is invalid - """ - if config.get("source") != "userProvided": - raise ValueError(f"Embedder '{name}' must have source 'userProvided'") - - if "dimensions" not in config: - raise ValueError( - f"Embedder '{name}' with source 'userProvided' must include 'dimensions' field" - ) - - if "documentTemplate" in config: - raise ValueError( - f"Embedder '{name}' with source 'userProvided' cannot include 'documentTemplate' field" - ) - class Embedders(CamelBase): """Container for embedder configurations. @@ -321,43 +213,99 @@ class Embedders(CamelBase): ], ] - @classmethod - def validate_config(cls, config: Dict[str, Dict[str, Any]]) -> None: - """Validate the configuration for embedders. 
- - Parameters - ---------- - config: Dict[str, Dict[str, Any]] - The configuration to validate, where keys are embedder names and values are embedder configurations - - Raises - ------ - ValueError - If the configuration is invalid - """ - for name, embedder_config in config.items(): - source = embedder_config.get("source") - if source not in ["openAi", "huggingFace", "ollama", "rest", "userProvided"]: - raise ValueError( - f"Invalid source for embedder '{name}'. " - f"Must be one of: 'openAi', 'huggingFace', 'ollama', 'rest', 'userProvided'." - ) - - # Clean up None values for optional fields - if ( - "documentTemplateMaxBytes" in embedder_config - and embedder_config["documentTemplateMaxBytes"] is None - ): - del embedder_config["documentTemplateMaxBytes"] - - # Validate based on source - if source == "openAi": - OpenAiEmbedder.validate_config(name, embedder_config) - elif source == "huggingFace": - HuggingFaceEmbedder.validate_config(name, embedder_config) - elif source == "ollama": - OllamaEmbedder.validate_config(name, embedder_config) - elif source == "rest": - RestEmbedder.validate_config(name, embedder_config) - elif source == "userProvided": - UserProvidedEmbedder.validate_config(name, embedder_config) + +def validate_embedder_config(embedder_name: str, config: Dict[str, Any]) -> Dict[str, Any]: + """Validate an embedder configuration. + + Parameters + ---------- + embedder_name: str + The name of the embedder + config: Dict[str, Any] + The embedder configuration + + Returns + ------- + Dict[str, Any] + The validated and cleaned embedder configuration + + Raises + ------ + ValueError + If the configuration is invalid + """ + # Validate source field + source = config.get("source") + if source not in ["openAi", "huggingFace", "ollama", "rest", "userProvided"]: + raise ValueError( + f"Invalid source for embedder '{embedder_name}'. " + f"Must be one of: 'openAi', 'huggingFace', 'ollama', 'rest', 'userProvided'." 
+ ) + + # Create a copy of the config to avoid modifying the original + cleaned_config = config.copy() + + # Validate based on source type + if source == "openAi": + OpenAiEmbedder(**cleaned_config) + elif source == "huggingFace": + HuggingFaceEmbedder(**cleaned_config) + elif source == "ollama": + OllamaEmbedder(**cleaned_config) + elif source == "rest": + # Validate required fields for REST embedder + if "request" not in cleaned_config or "response" not in cleaned_config: + raise ValueError( + f"Embedder '{embedder_name}' with source 'rest' must include 'request' and 'response' fields." + ) + RestEmbedder(**cleaned_config) + elif source == "userProvided": + # Validate required fields for UserProvided embedder + if "dimensions" not in cleaned_config: + raise ValueError( + f"Embedder '{embedder_name}' with source 'userProvided' must include 'dimensions' field." + ) + + # Remove fields not supported by UserProvided + for field in ["documentTemplate", "documentTemplateMaxBytes"]: + if field in cleaned_config: + del cleaned_config[field] + + UserProvidedEmbedder(**cleaned_config) + + # Clean up None values for optional fields + if ( + "documentTemplateMaxBytes" in cleaned_config + and cleaned_config["documentTemplateMaxBytes"] is None + ): + del cleaned_config["documentTemplateMaxBytes"] + + return cleaned_config + + +def validate_embedders(embedders: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + """Validate a dictionary of embedder configurations. 
+ + Parameters + ---------- + embedders: MutableMapping[str, Any] + Dictionary of embedder configurations + + Returns + ------- + MutableMapping[str, Any] + The validated and cleaned embedder configurations + + Raises + ------ + ValueError + If any configuration is invalid + """ + if not embedders: + return embedders + + cleaned_embedders = {} + for embedder_name, config in embedders.items(): + cleaned_embedders[embedder_name] = validate_embedder_config(embedder_name, config) + + return cleaned_embedders From 8a4369d4d41779f6d8f58007a45a94e4e587c203 Mon Sep 17 00:00:00 2001 From: Strift Date: Sat, 8 Mar 2025 17:35:23 +0800 Subject: [PATCH 07/24] Remove validation to let meilisearch handle it --- meilisearch/index.py | 19 +----- meilisearch/models/embedders.py | 102 +------------------------------- 2 files changed, 3 insertions(+), 118 deletions(-) diff --git a/meilisearch/index.py b/meilisearch/index.py index 4ac116fb..94085835 100644 --- a/meilisearch/index.py +++ b/meilisearch/index.py @@ -40,7 +40,6 @@ OpenAiEmbedder, RestEmbedder, UserProvidedEmbedder, - validate_embedders, ) from meilisearch.models.task import Task, TaskInfo, TaskResults from meilisearch.task import TaskHandler @@ -1012,21 +1011,12 @@ def update_settings(self, body: MutableMapping[str, Any]) -> TaskInfo: Raises ------ - ValueError - If the provided embedder configuration is invalid. MeilisearchApiError An error containing details about why Meilisearch can't process your request. 
Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors """ - # Create a copy of the body to avoid modifying the original - body_copy = body.copy() - - # Validate embedders if present - if "embedders" in body_copy: - body_copy["embedders"] = validate_embedders(body_copy["embedders"]) - task = self.http.patch( - f"{self.config.paths.index}/{self.uid}/{self.config.paths.setting}", body_copy + f"{self.config.paths.index}/{self.uid}/{self.config.paths.setting}", body ) return TaskInfo(**task) @@ -1986,16 +1976,11 @@ def update_embedders(self, body: Union[MutableMapping[str, Any], None]) -> TaskI Raises ------ - ValueError - If the provided embedder configuration is invalid. MeilisearchApiError An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors """ - # Validate embedders - validated_body = validate_embedders(body) if body else None - - task = self.http.patch(self.__settings_url_for(self.config.paths.embedders), validated_body) + task = self.http.patch(self.__settings_url_for(self.config.paths.embedders), body) return TaskInfo(**task) diff --git a/meilisearch/models/embedders.py b/meilisearch/models/embedders.py index f0dcc0f6..0f1b95db 100644 --- a/meilisearch/models/embedders.py +++ b/meilisearch/models/embedders.py @@ -1,13 +1,10 @@ from __future__ import annotations -from typing import Any, Dict, List, Optional, Union, Mapping, MutableMapping, Type, TypeVar +from typing import Any, Dict, List, Optional, Union from camel_converter.pydantic_base import CamelBase -T = TypeVar("T", bound="CamelBase") - - class Distribution(CamelBase): """Distribution settings for embedders. 
@@ -212,100 +209,3 @@ class Embedders(CamelBase): OpenAiEmbedder, HuggingFaceEmbedder, OllamaEmbedder, RestEmbedder, UserProvidedEmbedder ], ] - - -def validate_embedder_config(embedder_name: str, config: Dict[str, Any]) -> Dict[str, Any]: - """Validate an embedder configuration. - - Parameters - ---------- - embedder_name: str - The name of the embedder - config: Dict[str, Any] - The embedder configuration - - Returns - ------- - Dict[str, Any] - The validated and cleaned embedder configuration - - Raises - ------ - ValueError - If the configuration is invalid - """ - # Validate source field - source = config.get("source") - if source not in ["openAi", "huggingFace", "ollama", "rest", "userProvided"]: - raise ValueError( - f"Invalid source for embedder '{embedder_name}'. " - f"Must be one of: 'openAi', 'huggingFace', 'ollama', 'rest', 'userProvided'." - ) - - # Create a copy of the config to avoid modifying the original - cleaned_config = config.copy() - - # Validate based on source type - if source == "openAi": - OpenAiEmbedder(**cleaned_config) - elif source == "huggingFace": - HuggingFaceEmbedder(**cleaned_config) - elif source == "ollama": - OllamaEmbedder(**cleaned_config) - elif source == "rest": - # Validate required fields for REST embedder - if "request" not in cleaned_config or "response" not in cleaned_config: - raise ValueError( - f"Embedder '{embedder_name}' with source 'rest' must include 'request' and 'response' fields." - ) - RestEmbedder(**cleaned_config) - elif source == "userProvided": - # Validate required fields for UserProvided embedder - if "dimensions" not in cleaned_config: - raise ValueError( - f"Embedder '{embedder_name}' with source 'userProvided' must include 'dimensions' field." 
- ) - - # Remove fields not supported by UserProvided - for field in ["documentTemplate", "documentTemplateMaxBytes"]: - if field in cleaned_config: - del cleaned_config[field] - - UserProvidedEmbedder(**cleaned_config) - - # Clean up None values for optional fields - if ( - "documentTemplateMaxBytes" in cleaned_config - and cleaned_config["documentTemplateMaxBytes"] is None - ): - del cleaned_config["documentTemplateMaxBytes"] - - return cleaned_config - - -def validate_embedders(embedders: MutableMapping[str, Any]) -> MutableMapping[str, Any]: - """Validate a dictionary of embedder configurations. - - Parameters - ---------- - embedders: MutableMapping[str, Any] - Dictionary of embedder configurations - - Returns - ------- - MutableMapping[str, Any] - The validated and cleaned embedder configurations - - Raises - ------ - ValueError - If any configuration is invalid - """ - if not embedders: - return embedders - - cleaned_embedders = {} - for embedder_name, config in embedders.items(): - cleaned_embedders[embedder_name] = validate_embedder_config(embedder_name, config) - - return cleaned_embedders From 742ef5ece89f3bd11dd5b00a42a4d3919632f33e Mon Sep 17 00:00:00 2001 From: Strift Date: Sat, 8 Mar 2025 17:43:38 +0800 Subject: [PATCH 08/24] Remove unused parameters --- meilisearch/models/embedders.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/meilisearch/models/embedders.py b/meilisearch/models/embedders.py index 0f1b95db..b449b61e 100644 --- a/meilisearch/models/embedders.py +++ b/meilisearch/models/embedders.py @@ -65,8 +65,6 @@ class HuggingFaceEmbedder(CamelBase): The embedder source, must be "huggingFace" url: Optional[str] The URL Meilisearch contacts when querying the embedder - api_key: Optional[str] - Authentication token Meilisearch should send with each request to the embedder model: Optional[str] The model your embedder uses when generating vectors (defaults to BAAI/bge-base-en-v1.5) dimensions: Optional[int] @@ -85,7 +83,6 @@ 
class HuggingFaceEmbedder(CamelBase): source: str = "huggingFace" url: Optional[str] = None - api_key: Optional[str] = None model: Optional[str] = None # Defaults to BAAI/bge-base-en-v1.5 dimensions: Optional[int] = None revision: Optional[str] = None @@ -143,7 +140,7 @@ class RestEmbedder(CamelBase): api_key: Optional[str] Authentication token Meilisearch should send with each request to the embedder dimensions: Optional[int] - Number of dimensions in the chosen model + Number of dimensions in the embeddings document_template: Optional[str] Template defining the data Meilisearch sends to the embedder document_template_max_bytes: Optional[int] From c4e26d77b891580dbfcb99e862600bf9acd71b09 Mon Sep 17 00:00:00 2001 From: Strift Date: Sun, 9 Mar 2025 12:29:59 +0800 Subject: [PATCH 09/24] Add hybrid search --- README.md | 21 ++++++++++++++++++++ meilisearch/index.py | 7 +++++-- meilisearch/models/__init__.py | 1 + tests/index/test_index_search_meilisearch.py | 9 +++++++++ 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7bfe6193..c072c3e0 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,27 @@ JSON output: } ``` +#### Hybrid Search + +Hybrid search combines traditional keyword search with semantic search for more relevant results. You need to have an embedder configured in your index settings to use this feature. + +```python +# Using hybrid search with the search method +index.search( + 'action movie', + { + "hybrid": {"semanticRatio": 0.5, "embedder": "default"} + } +) +``` + +The `semanticRatio` parameter (between 0 and 1) controls the balance between keyword search and semantic search: +- 0: Only keyword search +- 1: Only semantic search +- Values in between: A mix of both approaches + +The `embedder` parameter specifies which configured embedder to use for the semantic search component. 
+ #### Custom Search With Filters If you want to enable filtering, you must add your attributes to the `filterableAttributes` index setting. diff --git a/meilisearch/index.py b/meilisearch/index.py index 94085835..aefe37ec 100644 --- a/meilisearch/index.py +++ b/meilisearch/index.py @@ -280,14 +280,15 @@ def get_stats(self) -> IndexStats: def search(self, query: str, opt_params: Optional[Mapping[str, Any]] = None) -> Dict[str, Any]: """Search in the index. + https://www.meilisearch.com/docs/reference/api/search + Parameters ---------- query: String containing the searched word(s) opt_params (optional): Dictionary containing optional query parameters. - Note: The vector parameter is only available in Meilisearch >= v1.13.0 - https://www.meilisearch.com/docs/reference/api/search#search-in-an-index + For hybrid search, include a 'hybrid' object with 'semanticRatio' and 'embedder' fields. Returns ------- @@ -301,7 +302,9 @@ def search(self, query: str, opt_params: Optional[Mapping[str, Any]] = None) -> """ if opt_params is None: opt_params = {} + body = {"q": query, **opt_params} + return self.http.post( f"{self.config.paths.index}/{self.uid}/{self.config.paths.search}", body=body, diff --git a/meilisearch/models/__init__.py b/meilisearch/models/__init__.py index e69de29b..8b137891 100644 --- a/meilisearch/models/__init__.py +++ b/meilisearch/models/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/index/test_index_search_meilisearch.py b/tests/index/test_index_search_meilisearch.py index 8939e9c5..78a31adc 100644 --- a/tests/index/test_index_search_meilisearch.py +++ b/tests/index/test_index_search_meilisearch.py @@ -503,6 +503,7 @@ def test_show_ranking_score(index_with_documents): def test_vector_search(index_with_documents_and_vectors): + """Tests vector search with hybrid parameters.""" response = index_with_documents_and_vectors().search( "", opt_params={"vector": [0.1, 0.2], "hybrid": {"semanticRatio": 1.0, "embedder": "default"}}, @@ -510,6 +511,14 @@ def 
test_vector_search(index_with_documents_and_vectors): assert len(response["hits"]) > 0 +def test_hybrid_search(index_with_documents_and_vectors): + """Tests hybrid search with semantic ratio and embedder.""" + response = index_with_documents_and_vectors().search( + "movie", opt_params={"hybrid": {"semanticRatio": 0.5, "embedder": "default"}} + ) + assert len(response["hits"]) > 0 + + def test_search_distinct(index_with_documents): index_with_documents().update_filterable_attributes(["genre"]) response = index_with_documents().search("with", {"distinct": "genre"}) From 5e954acfebee453f56acc346ac515b202aacc60d Mon Sep 17 00:00:00 2001 From: Strift Date: Sun, 9 Mar 2025 12:37:44 +0800 Subject: [PATCH 10/24] Add test for retrieving vectors --- meilisearch/index.py | 8 +++++++- tests/index/test_index_search_meilisearch.py | 15 +++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/meilisearch/index.py b/meilisearch/index.py index aefe37ec..0bcc3125 100644 --- a/meilisearch/index.py +++ b/meilisearch/index.py @@ -288,7 +288,13 @@ def search(self, query: str, opt_params: Optional[Mapping[str, Any]] = None) -> String containing the searched word(s) opt_params (optional): Dictionary containing optional query parameters. - For hybrid search, include a 'hybrid' object with 'semanticRatio' and 'embedder' fields. 
+ Common parameters include: + - hybrid: Dict with 'semanticRatio' and 'embedder' fields for hybrid search + - vector: Array of numbers for vector search + - retrieveVectors: Boolean to include vector data in search results + - filter: Filter queries by an attribute's value + - limit: Maximum number of documents returned + - offset: Number of documents to skip Returns ------- diff --git a/tests/index/test_index_search_meilisearch.py b/tests/index/test_index_search_meilisearch.py index 78a31adc..ea35bcad 100644 --- a/tests/index/test_index_search_meilisearch.py +++ b/tests/index/test_index_search_meilisearch.py @@ -543,3 +543,18 @@ def test_search_ranking_threshold(query, ranking_score_threshold, expected, inde query, {"rankingScoreThreshold": ranking_score_threshold} ) assert len(response["hits"]) == expected + + +def test_vector_search_with_retrieve_vectors(index_with_documents_and_vectors): + """Tests vector search with retrieveVectors parameter.""" + response = index_with_documents_and_vectors().search( + "", + opt_params={ + "vector": [0.1, 0.2], + "retrieveVectors": True, + "hybrid": {"semanticRatio": 1.0, "embedder": "default"}, + }, + ) + assert len(response["hits"]) > 0 + # Check that _vectors field is present in the response + assert "_vectors" in response["hits"][0] From 05291f433455993a70b9843f50f04d46e15a368b Mon Sep 17 00:00:00 2001 From: Strift Date: Sun, 9 Mar 2025 12:41:16 +0800 Subject: [PATCH 11/24] Add semanticHitCount test --- tests/index/test_index_search_meilisearch.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/index/test_index_search_meilisearch.py b/tests/index/test_index_search_meilisearch.py index ea35bcad..54d66ebd 100644 --- a/tests/index/test_index_search_meilisearch.py +++ b/tests/index/test_index_search_meilisearch.py @@ -509,6 +509,10 @@ def test_vector_search(index_with_documents_and_vectors): opt_params={"vector": [0.1, 0.2], "hybrid": {"semanticRatio": 1.0, "embedder": "default"}}, ) assert 
len(response["hits"]) > 0 + # Check that semanticHitCount field is present in the response + assert "semanticHitCount" in response + # With semanticRatio = 1.0, all hits should be semantic + assert response["semanticHitCount"] == len(response["hits"]) def test_hybrid_search(index_with_documents_and_vectors): @@ -517,6 +521,10 @@ def test_hybrid_search(index_with_documents_and_vectors): "movie", opt_params={"hybrid": {"semanticRatio": 0.5, "embedder": "default"}} ) assert len(response["hits"]) > 0 + # Check that semanticHitCount field is present in the response + assert "semanticHitCount" in response + # semanticHitCount should be an integer + assert isinstance(response["semanticHitCount"], int) def test_search_distinct(index_with_documents): From b064b0b4308e17e42d517152953c8bff96752f02 Mon Sep 17 00:00:00 2001 From: Strift Date: Sun, 9 Mar 2025 12:52:21 +0800 Subject: [PATCH 12/24] Update comment --- tests/index/test_index_search_meilisearch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/index/test_index_search_meilisearch.py b/tests/index/test_index_search_meilisearch.py index 54d66ebd..85972b0c 100644 --- a/tests/index/test_index_search_meilisearch.py +++ b/tests/index/test_index_search_meilisearch.py @@ -564,5 +564,5 @@ def test_vector_search_with_retrieve_vectors(index_with_documents_and_vectors): }, ) assert len(response["hits"]) > 0 - # Check that _vectors field is present in the response + # Check that _vectors field is present in the response hits assert "_vectors" in response["hits"][0] From d5d928e3b9ec5adbea5fd37070956651c6d3fdf7 Mon Sep 17 00:00:00 2001 From: Strift Date: Sun, 9 Mar 2025 13:08:40 +0800 Subject: [PATCH 13/24] Add test for similar documents --- tests/index/test_index_search_meilisearch.py | 54 +++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/tests/index/test_index_search_meilisearch.py b/tests/index/test_index_search_meilisearch.py index 85972b0c..8b087d1c 100644 --- 
a/tests/index/test_index_search_meilisearch.py +++ b/tests/index/test_index_search_meilisearch.py @@ -564,5 +564,57 @@ def test_vector_search_with_retrieve_vectors(index_with_documents_and_vectors): }, ) assert len(response["hits"]) > 0 - # Check that _vectors field is present in the response hits + # Check that the first hit has a _vectors field assert "_vectors" in response["hits"][0] + # Check that the _vectors field contains the default embedder + assert "default" in response["hits"][0]["_vectors"] + + +def test_get_similar_documents_with_identical_vectors(empty_index): + """Tests get_similar_documents method with documents having identical vectors.""" + # Create documents with identical vector embeddings + identical_vector = [0.5, 0.5] + documents = [ + {"id": "doc1", "title": "Document 1", "_vectors": {"default": identical_vector}}, + {"id": "doc2", "title": "Document 2", "_vectors": {"default": identical_vector}}, + {"id": "doc3", "title": "Document 3", "_vectors": {"default": identical_vector}}, + # Add a document with a different vector to verify it's not returned first + {"id": "doc4", "title": "Document 4", "_vectors": {"default": [0.1, 0.1]}}, + ] + + # Set up the index with the documents + index = empty_index() + + # Configure the embedder + settings_update_task = index.update_embedders( + { + "default": { + "source": "userProvided", + "dimensions": 2, + } + } + ) + index.wait_for_task(settings_update_task.task_uid) + + # Add the documents + document_addition_task = index.add_documents(documents) + index.wait_for_task(document_addition_task.task_uid) + + # Test get_similar_documents with doc1 + response = index.get_similar_documents({"id": "doc1", "embedder": "default"}) + + # Verify response structure + assert isinstance(response, dict) + assert "hits" in response + assert len(response["hits"]) >= 2 # Should find at least doc2 and doc3 + assert "id" in response + assert response["id"] == "doc1" + + # Verify that doc2 and doc3 are in the results (they 
have identical vectors to doc1) + result_ids = [hit["id"] for hit in response["hits"]] + assert "doc2" in result_ids + assert "doc3" in result_ids + + # Verify that doc4 is not the first result (it has a different vector) + if "doc4" in result_ids: + assert result_ids[0] != "doc4" From b49cb424054adc2fb4c2c83da1b84358b7741fa6 Mon Sep 17 00:00:00 2001 From: Strift Date: Sun, 9 Mar 2025 13:41:02 +0800 Subject: [PATCH 14/24] Fix linters errors --- meilisearch/index.py | 1 - meilisearch/models/__init__.py | 1 - meilisearch/models/embedders.py | 2 +- meilisearch/models/index.py | 12 +----------- tests/conftest.py | 2 +- tests/settings/test_settings.py | 2 +- tests/settings/test_settings_embedders.py | 2 +- 7 files changed, 5 insertions(+), 17 deletions(-) diff --git a/meilisearch/index.py b/meilisearch/index.py index 0bcc3125..2b92bbe8 100644 --- a/meilisearch/index.py +++ b/meilisearch/index.py @@ -33,7 +33,6 @@ TypoTolerance, ) from meilisearch.models.embedders import ( - Distribution, Embedders, HuggingFaceEmbedder, OllamaEmbedder, diff --git a/meilisearch/models/__init__.py b/meilisearch/models/__init__.py index 8b137891..e69de29b 100644 --- a/meilisearch/models/__init__.py +++ b/meilisearch/models/__init__.py @@ -1 +0,0 @@ - diff --git a/meilisearch/models/embedders.py b/meilisearch/models/embedders.py index b449b61e..84ad5918 100644 --- a/meilisearch/models/embedders.py +++ b/meilisearch/models/embedders.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, Optional, Union from camel_converter.pydantic_base import CamelBase diff --git a/meilisearch/models/index.py b/meilisearch/models/index.py index 74ddd26b..5c7ca995 100644 --- a/meilisearch/models/index.py +++ b/meilisearch/models/index.py @@ -1,21 +1,11 @@ from __future__ import annotations from enum import Enum -from typing import Any, Dict, Iterator, List, Optional, Union +from typing import Any, Dict, Iterator, List, Optional 
from camel_converter import to_snake from camel_converter.pydantic_base import CamelBase -from meilisearch.models.embedders import ( - Distribution, - Embedders, - HuggingFaceEmbedder, - OllamaEmbedder, - OpenAiEmbedder, - RestEmbedder, - UserProvidedEmbedder, -) - class IndexStats: __dict: Dict diff --git a/tests/conftest.py b/tests/conftest.py index 2ec280de..295de52d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,7 +6,7 @@ import meilisearch from meilisearch.errors import MeilisearchApiError -from meilisearch.models.index import OpenAiEmbedder, UserProvidedEmbedder +from meilisearch.models.embedders import OpenAiEmbedder, UserProvidedEmbedder from tests import common diff --git a/tests/settings/test_settings.py b/tests/settings/test_settings.py index 8bbdafa2..147001de 100644 --- a/tests/settings/test_settings.py +++ b/tests/settings/test_settings.py @@ -1,7 +1,7 @@ # pylint: disable=redefined-outer-name import pytest -from meilisearch.models.index import OpenAiEmbedder, UserProvidedEmbedder +from meilisearch.models.embedders import OpenAiEmbedder, UserProvidedEmbedder @pytest.fixture diff --git a/tests/settings/test_settings_embedders.py b/tests/settings/test_settings_embedders.py index f932bbae..ab45e77c 100644 --- a/tests/settings/test_settings_embedders.py +++ b/tests/settings/test_settings_embedders.py @@ -1,6 +1,6 @@ # pylint: disable=redefined-outer-name -from meilisearch.models.index import OpenAiEmbedder, UserProvidedEmbedder +from meilisearch.models.embedders import OpenAiEmbedder, UserProvidedEmbedder def test_get_default_embedders(empty_index): From ef1b7717a7f46802e552635e240dbe2c4b1ebe2f Mon Sep 17 00:00:00 2001 From: Strift Date: Sun, 9 Mar 2025 13:51:27 +0800 Subject: [PATCH 15/24] Sort imports --- meilisearch/index.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/meilisearch/index.py b/meilisearch/index.py index 2b92bbe8..7b083d4c 100644 --- a/meilisearch/index.py +++ b/meilisearch/index.py @@ 
-24,14 +24,6 @@ from meilisearch.config import Config from meilisearch.errors import version_error_hint_message from meilisearch.models.document import Document, DocumentsResults -from meilisearch.models.index import ( - Faceting, - IndexStats, - LocalizedAttributes, - Pagination, - ProximityPrecision, - TypoTolerance, -) from meilisearch.models.embedders import ( Embedders, HuggingFaceEmbedder, @@ -40,6 +32,14 @@ RestEmbedder, UserProvidedEmbedder, ) +from meilisearch.models.index import ( + Faceting, + IndexStats, + LocalizedAttributes, + Pagination, + ProximityPrecision, + TypoTolerance, +) from meilisearch.models.task import Task, TaskInfo, TaskResults from meilisearch.task import TaskHandler From 297b3e4bbd4c41a2c8b0380384f8a1b559c75b23 Mon Sep 17 00:00:00 2001 From: Laurent Cazanove Date: Thu, 20 Mar 2025 07:05:03 +0100 Subject: [PATCH 16/24] Update meilisearch/models/embedders.py Co-authored-by: Bruno Casali --- meilisearch/models/embedders.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/meilisearch/models/embedders.py b/meilisearch/models/embedders.py index 84ad5918..fe082e50 100644 --- a/meilisearch/models/embedders.py +++ b/meilisearch/models/embedders.py @@ -1,7 +1,5 @@ from __future__ import annotations - from typing import Any, Dict, Optional, Union - from camel_converter.pydantic_base import CamelBase From c7c1700224ad37bca280dc385a9ca1e1057abbb1 Mon Sep 17 00:00:00 2001 From: Strift Date: Thu, 20 Mar 2025 14:41:25 +0800 Subject: [PATCH 17/24] Avoid repeating embedder type --- meilisearch/index.py | 14 +++----------- meilisearch/models/embedders.py | 19 +++++++++++++------ 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/meilisearch/index.py b/meilisearch/index.py index 7b083d4c..3fa28210 100644 --- a/meilisearch/index.py +++ b/meilisearch/index.py @@ -26,6 +26,7 @@ from meilisearch.models.document import Document, DocumentsResults from meilisearch.models.embedders import ( Embedders, + EmbedderType, HuggingFaceEmbedder, 
OllamaEmbedder, OpenAiEmbedder, @@ -966,7 +967,7 @@ def get_settings(self) -> Dict[str, Any]: ) if settings.get("embedders"): - embedders: dict[str, OpenAiEmbedder | HuggingFaceEmbedder | UserProvidedEmbedder] = {} + embedders: dict[str, EmbedderType] = {} for k, v in settings["embedders"].items(): if v.get("source") == "openAi": embedders[k] = OpenAiEmbedder(**v) @@ -1913,16 +1914,7 @@ def get_embedders(self) -> Embedders | None: if not response: return None - embedders: dict[ - str, - Union[ - OpenAiEmbedder, - HuggingFaceEmbedder, - OllamaEmbedder, - RestEmbedder, - UserProvidedEmbedder, - ], - ] = {} + embedders: dict[str, EmbedderType] = {} for k, v in response.items(): source = v.get("source") if source == "openAi": diff --git a/meilisearch/models/embedders.py b/meilisearch/models/embedders.py index fe082e50..01ba7b3c 100644 --- a/meilisearch/models/embedders.py +++ b/meilisearch/models/embedders.py @@ -1,5 +1,7 @@ from __future__ import annotations + from typing import Any, Dict, Optional, Union + from camel_converter.pydantic_base import CamelBase @@ -189,6 +191,16 @@ class UserProvidedEmbedder(CamelBase): binary_quantized: Optional[bool] = None +# Type alias for the embedder union type +EmbedderType = Union[ + OpenAiEmbedder, + HuggingFaceEmbedder, + OllamaEmbedder, + RestEmbedder, + UserProvidedEmbedder, +] + + class Embedders(CamelBase): """Container for embedder configurations. 
@@ -198,9 +210,4 @@ class Embedders(CamelBase): Dictionary of embedder configurations, where keys are embedder names """ - embedders: Dict[ - str, - Union[ - OpenAiEmbedder, HuggingFaceEmbedder, OllamaEmbedder, RestEmbedder, UserProvidedEmbedder - ], - ] + embedders: Dict[str, EmbedderType] From b1258c70da42432a1025d11b2c955161e027f37f Mon Sep 17 00:00:00 2001 From: Strift Date: Thu, 20 Mar 2025 14:49:46 +0800 Subject: [PATCH 18/24] Remove docs --- meilisearch/index.py | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/meilisearch/index.py b/meilisearch/index.py index 3fa28210..d0948a8e 100644 --- a/meilisearch/index.py +++ b/meilisearch/index.py @@ -1942,31 +1942,7 @@ def update_embedders(self, body: Union[MutableMapping[str, Any], None]) -> TaskI Parameters ---------- body: dict - Dictionary containing the embedders configuration. Each key represents an embedder name, - and the value is a dictionary with the embedder configuration. - - Supported embedder sources: - - 'openAi': OpenAI embedder - - 'huggingFace': HuggingFace embedder - - 'ollama': Ollama embedder - - 'rest': REST API embedder - - 'userProvided': User-provided embedder - - Required fields depend on the embedder source: - - 'rest' requires 'request' and 'response' fields - - 'userProvided' requires 'dimensions' field - - Optional fields (availability depends on source): - - 'url': The URL Meilisearch contacts when querying the embedder - - 'apiKey': Authentication token for the embedder - - 'model': The model used for generating vectors - - 'documentTemplate': Template defining the data sent to the embedder - - 'documentTemplateMaxBytes': Maximum size of rendered document template - - 'dimensions': Number of dimensions in the chosen model - - 'revision': Model revision hash (only for 'huggingFace') - - 'distribution': Object with 'mean' and 'sigma' fields - - 'binaryQuantized': Boolean to convert vector dimensions to 1-bit values - - 'headers': Custom headers 
for requests (only for 'rest') + Dictionary containing the embedders configuration. Returns ------- From 8960bc22ba3cf0674cc68fcafab2c7282d84eda2 Mon Sep 17 00:00:00 2001 From: Strift Date: Thu, 20 Mar 2025 14:55:01 +0800 Subject: [PATCH 19/24] Add unintentionally removed --- meilisearch/index.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/meilisearch/index.py b/meilisearch/index.py index d0948a8e..dadba130 100644 --- a/meilisearch/index.py +++ b/meilisearch/index.py @@ -1024,6 +1024,11 @@ def update_settings(self, body: MutableMapping[str, Any]) -> TaskInfo: An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors """ + if body.get("embedders"): + for _, v in body["embedders"].items(): + if "documentTemplateMaxBytes" in v and v["documentTemplateMaxBytes"] is None: + del v["documentTemplateMaxBytes"] + task = self.http.patch( f"{self.config.paths.index}/{self.uid}/{self.config.paths.setting}", body ) @@ -1956,6 +1961,11 @@ def update_embedders(self, body: Union[MutableMapping[str, Any], None]) -> TaskI An error containing details about why Meilisearch can't process your request. 
Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors """ + if body.get("embedders"): + for _, v in body["embedders"].items(): + if "documentTemplateMaxBytes" in v and v["documentTemplateMaxBytes"] is None: + del v["documentTemplateMaxBytes"] + task = self.http.patch(self.__settings_url_for(self.config.paths.embedders), body) return TaskInfo(**task) From 268aa4cd14a00ef376df0deafb754c2e55736abc Mon Sep 17 00:00:00 2001 From: Strift Date: Thu, 20 Mar 2025 15:05:23 +0800 Subject: [PATCH 20/24] Fix mypy issues --- meilisearch/index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilisearch/index.py b/meilisearch/index.py index dadba130..1ff211fe 100644 --- a/meilisearch/index.py +++ b/meilisearch/index.py @@ -1961,7 +1961,7 @@ def update_embedders(self, body: Union[MutableMapping[str, Any], None]) -> TaskI An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors """ - if body.get("embedders"): + if body is not None and body.get("embedders"): for _, v in body["embedders"].items(): if "documentTemplateMaxBytes" in v and v["documentTemplateMaxBytes"] is None: del v["documentTemplateMaxBytes"] From d8825aa13a242101338636306f810e81b0e8c08c Mon Sep 17 00:00:00 2001 From: Strift Date: Wed, 26 Mar 2025 16:09:58 +0800 Subject: [PATCH 21/24] Add test for embedders fields --- tests/settings/test_settings_embedders.py | 135 ++++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/tests/settings/test_settings_embedders.py b/tests/settings/test_settings_embedders.py index ab45e77c..cb5d6c92 100644 --- a/tests/settings/test_settings_embedders.py +++ b/tests/settings/test_settings_embedders.py @@ -42,3 +42,138 @@ def test_reset_embedders(new_embedders, empty_index): assert isinstance(response_get.embedders["open_ai"], OpenAiEmbedder) 
response_last = index.get_embedders() assert response_last is None + + +def test_embedder_format_fields(empty_index): + """Tests that each embedder type has the required fields and proper format.""" + index = empty_index() + + # Test OpenAi embedder + openai_embedder = { + "openai": { + "source": "openAi", + "api_key": "test-key", + "model": "text-embedding-3-small", + "dimensions": 1536, + "document_template": "{{title}}", + "document_template_max_bytes": 400, + "distribution": {"mean": 0.5, "sigma": 0.1}, + "binary_quantized": False, + } + } + response = index.update_embedders(openai_embedder) + index.wait_for_task(response.task_uid) + embedders = index.get_embedders() + assert embedders.embedders["openai"].source == "openAi" + assert embedders.embedders["openai"].api_key == "test-key" + assert embedders.embedders["openai"].model == "text-embedding-3-small" + assert embedders.embedders["openai"].dimensions == 1536 + assert embedders.embedders["openai"].document_template == "{{title}}" + assert embedders.embedders["openai"].document_template_max_bytes == 400 + assert embedders.embedders["openai"].distribution.mean == 0.5 + assert embedders.embedders["openai"].distribution.sigma == 0.1 + assert embedders.embedders["openai"].binary_quantized is False + + # Test HuggingFace embedder + huggingface_embedder = { + "huggingface": { + "source": "huggingFace", + "model": "BAAI/bge-base-en-v1.5", + "dimensions": 768, + "revision": "main", + "document_template": "{{title}}", + "document_template_max_bytes": 400, + "distribution": {"mean": 0.5, "sigma": 0.1}, + "binary_quantized": False, + } + } + response = index.update_embedders(huggingface_embedder) + index.wait_for_task(response.task_uid) + embedders = index.get_embedders() + assert embedders.embedders["huggingface"].source == "huggingFace" + assert embedders.embedders["huggingface"].model == "BAAI/bge-base-en-v1.5" + assert embedders.embedders["huggingface"].dimensions == 768 + assert 
embedders.embedders["huggingface"].revision == "main" + assert embedders.embedders["huggingface"].document_template == "{{title}}" + assert embedders.embedders["huggingface"].document_template_max_bytes == 400 + assert embedders.embedders["huggingface"].distribution.mean == 0.5 + assert embedders.embedders["huggingface"].distribution.sigma == 0.1 + assert embedders.embedders["huggingface"].binary_quantized is False + + # Test Ollama embedder + ollama_embedder = { + "ollama": { + "source": "ollama", + "url": "http://localhost:11434/api/embeddings", + "api_key": "test-key", + "model": "llama2", + "dimensions": 4096, + "document_template": "{{title}}", + "document_template_max_bytes": 400, + "distribution": {"mean": 0.5, "sigma": 0.1}, + "binary_quantized": False, + } + } + response = index.update_embedders(ollama_embedder) + index.wait_for_task(response.task_uid) + embedders = index.get_embedders() + assert embedders.embedders["ollama"].source == "ollama" + assert embedders.embedders["ollama"].url == "http://localhost:11434/api/embeddings" + assert embedders.embedders["ollama"].api_key == "test-key" + assert embedders.embedders["ollama"].model == "llama2" + assert embedders.embedders["ollama"].dimensions == 4096 + assert embedders.embedders["ollama"].document_template == "{{title}}" + assert embedders.embedders["ollama"].document_template_max_bytes == 400 + assert embedders.embedders["ollama"].distribution.mean == 0.5 + assert embedders.embedders["ollama"].distribution.sigma == 0.1 + assert embedders.embedders["ollama"].binary_quantized is False + + # Test Rest embedder + rest_embedder = { + "rest": { + "source": "rest", + "url": "http://localhost:8000/embed", + "api_key": "test-key", + "dimensions": 512, + "document_template": "{{title}}", + "document_template_max_bytes": 400, + "request": {"text": "{{title}}"}, + "response": {"embedding": "vector"}, + "headers": {"Authorization": "Bearer test-key"}, + "distribution": {"mean": 0.5, "sigma": 0.1}, + 
"binary_quantized": False, + } + } + response = index.update_embedders(rest_embedder) + index.wait_for_task(response.task_uid) + embedders = index.get_embedders() + assert embedders.embedders["rest"].source == "rest" + assert embedders.embedders["rest"].url == "http://localhost:8000/embed" + assert embedders.embedders["rest"].api_key == "test-key" + assert embedders.embedders["rest"].dimensions == 512 + assert embedders.embedders["rest"].document_template == "{{title}}" + assert embedders.embedders["rest"].document_template_max_bytes == 400 + assert embedders.embedders["rest"].request == {"text": "{{title}}"} + assert embedders.embedders["rest"].response == {"embedding": "vector"} + assert embedders.embedders["rest"].headers == {"Authorization": "Bearer test-key"} + assert embedders.embedders["rest"].distribution.mean == 0.5 + assert embedders.embedders["rest"].distribution.sigma == 0.1 + assert embedders.embedders["rest"].binary_quantized is False + + # Test UserProvided embedder + user_provided_embedder = { + "user_provided": { + "source": "userProvided", + "dimensions": 512, + "distribution": {"mean": 0.5, "sigma": 0.1}, + "binary_quantized": False, + } + } + response = index.update_embedders(user_provided_embedder) + index.wait_for_task(response.task_uid) + embedders = index.get_embedders() + assert embedders.embedders["user_provided"].source == "userProvided" + assert embedders.embedders["user_provided"].dimensions == 512 + assert embedders.embedders["user_provided"].distribution.mean == 0.5 + assert embedders.embedders["user_provided"].distribution.sigma == 0.1 + assert embedders.embedders["user_provided"].binary_quantized is False From b324323de5ba28c17f311bd9f9c2b782f35371b3 Mon Sep 17 00:00:00 2001 From: Strift Date: Wed, 26 Mar 2025 16:29:45 +0800 Subject: [PATCH 22/24] Add tests for fields presence --- tests/settings/test_settings_embedders.py | 49 +++++++++-------------- 1 file changed, 20 insertions(+), 29 deletions(-) diff --git 
a/tests/settings/test_settings_embedders.py b/tests/settings/test_settings_embedders.py index cb5d6c92..015ce40b 100644 --- a/tests/settings/test_settings_embedders.py +++ b/tests/settings/test_settings_embedders.py @@ -52,23 +52,21 @@ def test_embedder_format_fields(empty_index): openai_embedder = { "openai": { "source": "openAi", - "api_key": "test-key", + "apiKey": "test-key", "model": "text-embedding-3-small", "dimensions": 1536, - "document_template": "{{title}}", - "document_template_max_bytes": 400, + "documentTemplateMaxBytes": 400, "distribution": {"mean": 0.5, "sigma": 0.1}, - "binary_quantized": False, + "binaryQuantized": False, } } response = index.update_embedders(openai_embedder) index.wait_for_task(response.task_uid) embedders = index.get_embedders() assert embedders.embedders["openai"].source == "openAi" - assert embedders.embedders["openai"].api_key == "test-key" assert embedders.embedders["openai"].model == "text-embedding-3-small" assert embedders.embedders["openai"].dimensions == 1536 - assert embedders.embedders["openai"].document_template == "{{title}}" + assert hasattr(embedders.embedders["openai"], "document_template") assert embedders.embedders["openai"].document_template_max_bytes == 400 assert embedders.embedders["openai"].distribution.mean == 0.5 assert embedders.embedders["openai"].distribution.sigma == 0.1 @@ -79,12 +77,10 @@ def test_embedder_format_fields(empty_index): "huggingface": { "source": "huggingFace", "model": "BAAI/bge-base-en-v1.5", - "dimensions": 768, "revision": "main", - "document_template": "{{title}}", - "document_template_max_bytes": 400, + "documentTemplateMaxBytes": 400, "distribution": {"mean": 0.5, "sigma": 0.1}, - "binary_quantized": False, + "binaryQuantized": False, } } response = index.update_embedders(huggingface_embedder) @@ -92,9 +88,8 @@ def test_embedder_format_fields(empty_index): embedders = index.get_embedders() assert embedders.embedders["huggingface"].source == "huggingFace" assert 
embedders.embedders["huggingface"].model == "BAAI/bge-base-en-v1.5" - assert embedders.embedders["huggingface"].dimensions == 768 assert embedders.embedders["huggingface"].revision == "main" - assert embedders.embedders["huggingface"].document_template == "{{title}}" + assert hasattr(embedders.embedders["huggingface"], "document_template") assert embedders.embedders["huggingface"].document_template_max_bytes == 400 assert embedders.embedders["huggingface"].distribution.mean == 0.5 assert embedders.embedders["huggingface"].distribution.sigma == 0.1 @@ -105,13 +100,12 @@ def test_embedder_format_fields(empty_index): "ollama": { "source": "ollama", "url": "http://localhost:11434/api/embeddings", - "api_key": "test-key", + "apiKey": "test-key", "model": "llama2", "dimensions": 4096, - "document_template": "{{title}}", - "document_template_max_bytes": 400, + "documentTemplateMaxBytes": 400, "distribution": {"mean": 0.5, "sigma": 0.1}, - "binary_quantized": False, + "binaryQuantized": False, } } response = index.update_embedders(ollama_embedder) @@ -119,10 +113,9 @@ def test_embedder_format_fields(empty_index): embedders = index.get_embedders() assert embedders.embedders["ollama"].source == "ollama" assert embedders.embedders["ollama"].url == "http://localhost:11434/api/embeddings" - assert embedders.embedders["ollama"].api_key == "test-key" assert embedders.embedders["ollama"].model == "llama2" assert embedders.embedders["ollama"].dimensions == 4096 - assert embedders.embedders["ollama"].document_template == "{{title}}" + assert hasattr(embedders.embedders["ollama"], "document_template") assert embedders.embedders["ollama"].document_template_max_bytes == 400 assert embedders.embedders["ollama"].distribution.mean == 0.5 assert embedders.embedders["ollama"].distribution.sigma == 0.1 @@ -133,15 +126,14 @@ def test_embedder_format_fields(empty_index): "rest": { "source": "rest", "url": "http://localhost:8000/embed", - "api_key": "test-key", + "apiKey": "test-key", 
"dimensions": 512, - "document_template": "{{title}}", - "document_template_max_bytes": 400, - "request": {"text": "{{title}}"}, - "response": {"embedding": "vector"}, + "documentTemplateMaxBytes": 400, + "request": {"model": "MODEL_NAME", "input": "{{text}}"}, + "response": {"result": {"data": ["{{embedding}}"]}}, "headers": {"Authorization": "Bearer test-key"}, "distribution": {"mean": 0.5, "sigma": 0.1}, - "binary_quantized": False, + "binaryQuantized": False, } } response = index.update_embedders(rest_embedder) @@ -149,12 +141,11 @@ def test_embedder_format_fields(empty_index): embedders = index.get_embedders() assert embedders.embedders["rest"].source == "rest" assert embedders.embedders["rest"].url == "http://localhost:8000/embed" - assert embedders.embedders["rest"].api_key == "test-key" assert embedders.embedders["rest"].dimensions == 512 - assert embedders.embedders["rest"].document_template == "{{title}}" + assert hasattr(embedders.embedders["rest"], "document_template") assert embedders.embedders["rest"].document_template_max_bytes == 400 - assert embedders.embedders["rest"].request == {"text": "{{title}}"} - assert embedders.embedders["rest"].response == {"embedding": "vector"} + assert embedders.embedders["rest"].request == {"model": "MODEL_NAME", "input": "{{text}}"} + assert embedders.embedders["rest"].response == {"result": {"data": ["{{embedding}}"]}} assert embedders.embedders["rest"].headers == {"Authorization": "Bearer test-key"} assert embedders.embedders["rest"].distribution.mean == 0.5 assert embedders.embedders["rest"].distribution.sigma == 0.1 @@ -166,7 +157,7 @@ def test_embedder_format_fields(empty_index): "source": "userProvided", "dimensions": 512, "distribution": {"mean": 0.5, "sigma": 0.1}, - "binary_quantized": False, + "binaryQuantized": False, } } response = index.update_embedders(user_provided_embedder) From 057377bd693b150bf4f405e30a16da9bd4e5c892 Mon Sep 17 00:00:00 2001 From: Strift Date: Wed, 26 Mar 2025 16:50:39 +0800 
Subject: [PATCH 23/24] Split tests --- tests/settings/test_settings_embedders.py | 29 +++++++++++++++++------ 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/tests/settings/test_settings_embedders.py b/tests/settings/test_settings_embedders.py index 015ce40b..5baf2e09 100644 --- a/tests/settings/test_settings_embedders.py +++ b/tests/settings/test_settings_embedders.py @@ -44,11 +44,10 @@ def test_reset_embedders(new_embedders, empty_index): assert response_last is None -def test_embedder_format_fields(empty_index): - """Tests that each embedder type has the required fields and proper format.""" +def test_openai_embedder_format(empty_index): + """Tests that OpenAi embedder has the required fields and proper format.""" index = empty_index() - # Test OpenAi embedder openai_embedder = { "openai": { "source": "openAi", @@ -72,7 +71,11 @@ def test_embedder_format_fields(empty_index): assert embedders.embedders["openai"].distribution.sigma == 0.1 assert embedders.embedders["openai"].binary_quantized is False - # Test HuggingFace embedder + +def test_huggingface_embedder_format(empty_index): + """Tests that HuggingFace embedder has the required fields and proper format.""" + index = empty_index() + huggingface_embedder = { "huggingface": { "source": "huggingFace", @@ -95,7 +98,11 @@ def test_embedder_format_fields(empty_index): assert embedders.embedders["huggingface"].distribution.sigma == 0.1 assert embedders.embedders["huggingface"].binary_quantized is False - # Test Ollama embedder + +def test_ollama_embedder_format(empty_index): + """Tests that Ollama embedder has the required fields and proper format.""" + index = empty_index() + ollama_embedder = { "ollama": { "source": "ollama", @@ -121,7 +128,11 @@ def test_embedder_format_fields(empty_index): assert embedders.embedders["ollama"].distribution.sigma == 0.1 assert embedders.embedders["ollama"].binary_quantized is False - # Test Rest embedder + +def test_rest_embedder_format(empty_index): + """Tests 
that Rest embedder has the required fields and proper format.""" + index = empty_index() + rest_embedder = { "rest": { "source": "rest", @@ -151,7 +162,11 @@ def test_embedder_format_fields(empty_index): assert embedders.embedders["rest"].distribution.sigma == 0.1 assert embedders.embedders["rest"].binary_quantized is False - # Test UserProvided embedder + +def test_user_provided_embedder_format(empty_index): + """Tests that UserProvided embedder has the required fields and proper format.""" + index = empty_index() + user_provided_embedder = { "user_provided": { "source": "userProvided", From e515f297cf883d55d01c9b07b702ee6b2a9704bf Mon Sep 17 00:00:00 2001 From: Strift Date: Wed, 2 Apr 2025 16:28:32 +0800 Subject: [PATCH 24/24] Fix missing imports --- meilisearch/index.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/meilisearch/index.py b/meilisearch/index.py index d69bcbaf..812c3a57 100644 --- a/meilisearch/index.py +++ b/meilisearch/index.py @@ -28,6 +28,8 @@ Embedders, EmbedderType, HuggingFaceEmbedder, + OllamaEmbedder, + OpenAiEmbedder, RestEmbedder, UserProvidedEmbedder, ) @@ -35,11 +37,8 @@ Faceting, IndexStats, LocalizedAttributes, - OllamaEmbedder, - OpenAiEmbedder, Pagination, ProximityPrecision, - RestEmbedder, TypoTolerance, ) from meilisearch.models.task import Task, TaskInfo, TaskResults @@ -1924,7 +1923,6 @@ def get_embedders(self) -> Embedders | None: if not response: return None - embedders: dict[str, EmbedderType] = {} for k, v in response.items(): source = v.get("source") @@ -1968,9 +1966,24 @@ def update_embedders(self, body: Union[MutableMapping[str, Any], None]) -> TaskI Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors """ if body is not None and body.get("embedders"): - for _, v in body["embedders"].items(): - if "documentTemplateMaxBytes" in v and v["documentTemplateMaxBytes"] is None: - del 
v["documentTemplateMaxBytes"] + embedders: dict[str, EmbedderType] = {} + for k, v in body["embedders"].items(): + source = v.get("source") + if source == "openAi": + embedders[k] = OpenAiEmbedder(**v) + elif source == "huggingFace": + embedders[k] = HuggingFaceEmbedder(**v) + elif source == "ollama": + embedders[k] = OllamaEmbedder(**v) + elif source == "rest": + embedders[k] = RestEmbedder(**v) + elif source == "userProvided": + embedders[k] = UserProvidedEmbedder(**v) + else: + # Default to UserProvidedEmbedder for unknown sources + embedders[k] = UserProvidedEmbedder(**v) + + body = {"embedders": {k: v.model_dump(by_alias=True) for k, v in embedders.items()}} task = self.http.patch(self.__settings_url_for(self.config.paths.embedders), body)