Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 13 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ sqlite-memory bridges these concepts, allowing any SQLite-powered application to
- **Hybrid Search**: Combines vector similarity (cosine distance) with FTS5 full-text search for superior retrieval
- **Smart Chunking**: Markdown-aware parsing preserves semantic boundaries
- **Intelligent Sync**: Content-hash change detection skips unchanged files, atomically replaces modified ones, and cleans up deleted ones
- **Transactional Safety**: Every sync operation runs inside a SAVEPOINT transaction - either fully succeeds or fully rolls back, no partially-indexed content
- **Transactional Safety**: Text/file ingests run inside SAVEPOINT transactions, and directory sync uses transactional cleanup plus per-file transactional updates so failed files do not leave partial rows behind
- **Efficient Storage**: Binary embeddings with configurable dimensions
- **Embedding Cache**: Automatically caches computed embeddings, so re-indexing the same text skips redundant API calls and computation
- **Flexible Embedding**: Use local models (llama.cpp) or [vectors.space](https://vectors.space) remote API
Expand All @@ -61,6 +61,9 @@ sqlite-memory bridges these concepts, allowing any SQLite-powered application to

## Getting Started

> [!IMPORTANT]
> Databases created with sqlite-memory versions earlier than `1.0.0` must be rebuilt before use with `1.0.0+`, because the internal schema changed.

### Prerequisites

- SQLite
Expand All @@ -74,7 +77,7 @@ sqlite-memory bridges these concepts, allowing any SQLite-powered application to
```sql
-- Load extensions (sync is optional)
.load ./vector
.load ./sync
.load ./cloudsync
.load ./memory

-- Configure embedding model (choose one):
Expand All @@ -84,8 +87,8 @@ SELECT memory_set_model('local', '/path/to/nomic-embed-text-v1.5.Q8_0.gguf');

-- Option 2: Remote embedding via vectors.space (requires free API key from https://vectors.space)
-- The provider name 'openai' selects the vectors.space OpenAI-compatible endpoint.
-- SELECT memory_set_model('openai', 'text-embedding-3-small');
-- SELECT memory_set_apikey('your-vectorspace-api-key');
-- SELECT memory_set_model('openai', 'text-embedding-3-small');

-- Add some knowledge
SELECT memory_add_text('SQLite is a C-language library that implements a small, fast,
Expand Down Expand Up @@ -160,7 +163,7 @@ All `memory_add_*` functions use content-hash change detection to avoid redundan
1. **Cleanup**: Removes database entries for files that no longer exist on disk
2. **Scan**: Recursively processes all matching files - adding new ones, replacing modified ones, and skipping unchanged ones

Every sync operation is wrapped in a SQLite SAVEPOINT transaction. If anything fails mid-sync (embedding error, disk issue, etc.), the entire operation rolls back cleanly. There is no risk of partially-indexed files or orphaned entries.
`memory_add_text()` and `memory_add_file()` each run inside a SQLite SAVEPOINT transaction. `memory_add_directory()` performs its cleanup pass transactionally and then processes each file in its own transaction. If one file fails, that file rolls back cleanly and previously-committed files remain valid; there are no partially-indexed rows or orphaned chunk/FTS entries for the failed file.

This makes all sync functions safe to call repeatedly - for example, on a cron schedule or at agent startup - with minimal overhead.

Expand Down Expand Up @@ -258,8 +261,8 @@ FROM dbmem_content;
-- Delete by context
SELECT memory_delete_context('old-project');

-- Delete specific memory
SELECT memory_delete(1234567890);
-- Delete specific memory by hash
SELECT memory_delete('9e3779b97f4a7c15');

-- Clear all memories
SELECT memory_clear();
Expand All @@ -279,8 +282,11 @@ cd sqlite-memory
# Build (full build with local + remote engines)
make

# Run tests
# Run parser/core unit tests + extension loading smoke test
make test

# Run the full SQL extension unit suite
make test DEFINES="-DTEST_SQLITE_EXTENSION"
```

### Build Configurations
Expand Down
1 change: 1 addition & 0 deletions src/dbmem-embed.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ void dbmem_local_engine_free (dbmem_local_engine_t *engine);

dbmem_remote_engine_t *dbmem_remote_engine_init (void *ctx, const char *provider, const char *model, char err_msg[DBMEM_ERRBUF_SIZE]);
int dbmem_remote_compute_embedding (dbmem_remote_engine_t *engine, const char *text, int text_len, embedding_result_t *result);
int dbmem_remote_engine_set_apikey (dbmem_remote_engine_t *engine, const char *api_key, char err_msg[DBMEM_ERRBUF_SIZE]);
void dbmem_remote_engine_free (dbmem_remote_engine_t *engine);

// Custom provider (always available, defined in sqlite-memory.c)
Expand Down
14 changes: 10 additions & 4 deletions src/dbmem-lembed.c
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,15 @@ void dbmem_logger (enum ggml_log_level level, const char *text, void *user_data)

// MARK: -

// Forward an error message to the engine's owning context.
// Tolerates a NULL engine or an engine with no context attached;
// in either case the message is dropped silently.
static void dbmem_local_set_error(dbmem_local_engine_t *engine, const char *message) {
    if (engine == NULL) return;
    if (engine->context == NULL) return;
    dbmem_context_set_error(engine->context, message);
}

dbmem_local_engine_t *dbmem_local_engine_init (void *ctx, const char *model_path, char err_msg[DBMEM_ERRBUF_SIZE]) {
dbmem_local_engine_t *engine = (dbmem_local_engine_t *)dbmemory_zeroalloc(sizeof(dbmem_local_engine_t));
if (!engine) return NULL;
engine->context = (dbmem_context *)ctx;

// set logger
llama_log_set(dbmem_logger, engine);
Expand Down Expand Up @@ -212,7 +218,7 @@ int dbmem_local_compute_embedding (dbmem_local_engine_t *engine, const char *tex
// Tokenize
int n_tokens = llama_tokenize(engine->vocab, text, text_len, engine->tokens, engine->tokens_capacity, true, true);
if (n_tokens < 0) {
dbmem_context_set_error(engine->context, "Tokenization failed (text too long?)");
dbmem_local_set_error(engine, "Tokenization failed (text too long?)");
return -1;
}

Expand Down Expand Up @@ -242,7 +248,7 @@ int dbmem_local_compute_embedding (dbmem_local_engine_t *engine, const char *tex
// Encode
int ret = llama_encode(engine->ctx, batch);
if (ret != 0) {
dbmem_context_set_error(engine->context, "Llama_encode failed");
dbmem_local_set_error(engine, "Llama_encode failed");
return -1;
}

Expand All @@ -255,7 +261,7 @@ int dbmem_local_compute_embedding (dbmem_local_engine_t *engine, const char *tex
}

if (!emb_ptr) {
dbmem_context_set_error(engine->context, "Failed to get embeddings");
dbmem_local_set_error(engine, "Failed to get embeddings");
return -1;
}

Expand Down Expand Up @@ -301,5 +307,5 @@ void dbmem_local_engine_free (dbmem_local_engine_t *engine) {
}

llama_backend_free();
dbmemory_free(engine);
}

6 changes: 3 additions & 3 deletions src/dbmem-parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
typedef struct {
size_t start; // Byte offset in source buffer
size_t end; // Byte end in source buffer
int is_heading; // True if this section starts with a heading block
char *text; // Stripped plain text (allocated)
size_t text_len; // Length of stripped text
} section_t;
Expand Down Expand Up @@ -113,8 +114,6 @@ static size_t find_split (const char *text, size_t len, size_t max_chars) {

// Push a section to dynamic array
static int section_push (parse_ctx_t *ctx, size_t start, size_t end, int is_heading) {
UNUSED_PARAM(is_heading);

if (ctx->sec_count >= ctx->sec_cap) {
size_t new_cap = ctx->sec_cap ? ctx->sec_cap * 2 : 16;
section_t *tmp = (section_t *)dbmemory_realloc(ctx->sections, new_cap * sizeof(section_t));
Expand All @@ -126,6 +125,7 @@ static int section_push (parse_ctx_t *ctx, size_t start, size_t end, int is_head
section_t *s = &ctx->sections[ctx->sec_count++];
s->start = start;
s->end = end;
s->is_heading = is_heading;
s->text = NULL;
s->text_len = 0;

Expand Down Expand Up @@ -607,7 +607,7 @@ static int parse_sections (const char *buffer, size_t buffer_size, bool skip_sem
for (size_t i = 0; i < ctx->sec_count; i++) {
section_t *s = &ctx->sections[i];
// First section or heading starts new section
if (write_idx == 0) {
if (write_idx == 0 || s->is_heading) {
ctx->sections[write_idx++] = *s;
} else {
// Extend previous section to include this one
Expand Down
58 changes: 53 additions & 5 deletions src/dbmem-rembed.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ static size_t cacert_len = sizeof(cacert_pem) - 1;

#ifndef DBMEM_OMIT_CURL
static size_t dbmem_remote_receive_data(void *contents, size_t size, size_t nmemb, void *xdata);
static struct curl_slist *dbmem_remote_build_headers (const char *api_key);
#endif

struct dbmem_remote_engine_t {
Expand Down Expand Up @@ -67,6 +68,27 @@ struct dbmem_remote_engine_t {
#include <stdbool.h>
#include <stddef.h>

#ifndef DBMEM_OMIT_CURL
// Build the HTTP header list used for remote embedding requests:
// an Authorization bearer header derived from api_key, followed by a
// JSON content type. Returns a freshly allocated curl_slist owned by
// the caller, or NULL on allocation failure (nothing is leaked).
// NOTE(review): an api_key longer than ~490 bytes is silently truncated
// by snprintf — confirm keys are always shorter than the 512-byte buffer.
static struct curl_slist *dbmem_remote_build_headers (const char *api_key) {
    char bearer[512];
    snprintf(bearer, sizeof(bearer), "Authorization: Bearer %s", api_key);

    struct curl_slist *list = curl_slist_append(NULL, bearer);
    if (list == NULL) return NULL;

    struct curl_slist *extended = curl_slist_append(list, "Content-Type: application/json");
    if (extended == NULL) {
        curl_slist_free_all(list);
        return NULL;
    }

    return extended;
}
#endif

static bool text_needs_json_escape (const char *text, size_t *len) {
size_t original_len = *len;
size_t required_len = 0;
Expand Down Expand Up @@ -263,11 +285,7 @@ dbmem_remote_engine_t *dbmem_remote_engine_init (void *ctx, const char *provider
#endif

// set up headers
char auth_header[512];
snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", api_key);
struct curl_slist *headers = NULL;
headers = curl_slist_append(headers, auth_header);
if (headers) headers = curl_slist_append(headers, "Content-Type: application/json");
struct curl_slist *headers = dbmem_remote_build_headers(api_key);
if (!headers) {
snprintf(err_msg, DBMEM_ERRBUF_SIZE, "Failed to allocate HTTP headers");
curl_easy_cleanup(curl);
Expand Down Expand Up @@ -522,6 +540,36 @@ int dbmem_remote_compute_embedding (dbmem_remote_engine_t *engine, const char *t
return 0;
}

// Replace the API key used by a remote embedding engine.
//
// With curl support compiled in, the key is baked into a fresh HTTP header
// list that replaces the engine's current one; without curl, a private copy
// of the key string is stored on the engine instead. On failure the engine's
// previous state is left untouched.
//
// Returns SQLITE_OK on success, SQLITE_MISUSE for a NULL engine or key, and
// SQLITE_NOMEM on allocation failure. err_msg, when non-NULL, receives a
// human-readable description of the failure.
int dbmem_remote_engine_set_apikey (dbmem_remote_engine_t *engine, const char *api_key, char err_msg[DBMEM_ERRBUF_SIZE]) {
    if (engine == NULL || api_key == NULL) {
        if (err_msg) snprintf(err_msg, DBMEM_ERRBUF_SIZE, "Invalid remote engine or API key");
        return SQLITE_MISUSE;
    }

#ifndef DBMEM_OMIT_CURL
    struct curl_slist *fresh = dbmem_remote_build_headers(api_key);
    if (fresh == NULL) {
        if (err_msg) snprintf(err_msg, DBMEM_ERRBUF_SIZE, "Failed to allocate HTTP headers");
        return SQLITE_NOMEM;
    }

    // Point the curl handle at the new list before releasing the old one,
    // so the handle never references freed memory.
    curl_easy_setopt(engine->curl, CURLOPT_HTTPHEADER, fresh);
    if (engine->headers) curl_slist_free_all(engine->headers);
    engine->headers = fresh;
#else
    char *dup = dbmem_strdup(api_key);
    if (dup == NULL) {
        if (err_msg) snprintf(err_msg, DBMEM_ERRBUF_SIZE, "Unable to duplicate API key (insufficient memory)");
        return SQLITE_NOMEM;
    }

    if (engine->api_key) dbmemory_free(engine->api_key);
    engine->api_key = dup;
#endif

    return SQLITE_OK;
}

void dbmem_remote_engine_free (dbmem_remote_engine_t *engine) {
if (!engine) return;

Expand Down
Loading
Loading