diff --git a/.gitignore b/.gitignore
index 0268d5d3..7a477b03 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@
 sift/
 *.bin
 *.out
 venv/
+.venv
 vendor/
 dist/
diff --git a/Makefile b/Makefile
index 43e1d980..76121720 100644
--- a/Makefile
+++ b/Makefile
@@ -24,6 +24,9 @@ endif
 
 ifdef CONFIG_DARWIN
 LOADABLE_EXTENSION=dylib
+# Let unresolved SQLite symbols resolve against the host process at load time.
+# This is standard for SQLite loadable extensions on macOS.
+CFLAGS += -undefined dynamic_lookup
 endif
 
 ifdef CONFIG_LINUX
@@ -193,6 +196,10 @@ test-loadable: loadable
 test-loadable-snapshot-update: loadable
 	$(PYTHON) -m pytest -vv tests/test-loadable.py --snapshot-update
 
+# Update snapshots for all loadable tests (use after intentional behavior changes)
+test-snapshots-update: loadable
+	$(PYTHON) -m pytest -vv tests/test-*.py --snapshot-update
+
 test-loadable-watch:
 	watchexec --exts c,py,Makefile --clear -- make test-loadable
 
diff --git a/sqlite-vec.c b/sqlite-vec.c
index 3cc802f0..4d36bb83 100644
--- a/sqlite-vec.c
+++ b/sqlite-vec.c
@@ -8434,6 +8434,101 @@ int vec0Update_Delete_DeleteRowids(vec0_vtab *p, i64 rowid) {
   return rc;
 }
 
+// Clear the rowid slot in the chunks shadow table's rowids blob for the
+// given chunk/offset.
+int vec0Update_Delete_ClearRowid(vec0_vtab *p, i64 chunk_id, i64 chunk_offset) {
+  int rc;
+  sqlite3_blob *blobChunksRowids = NULL;
+
+  rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowChunksName, "rowids",
+                         chunk_id, 1, &blobChunksRowids);
+  if (rc != SQLITE_OK) {
+    vtab_set_error(&p->base, "could not open rowids blob for %s.%s.%lld",
+                   p->schemaName, p->shadowChunksName, chunk_id);
+    return SQLITE_ERROR;
+  }
+
+  i64 expected = p->chunk_size * sizeof(i64);
+  i64 actual = sqlite3_blob_bytes(blobChunksRowids);
+  if (expected != actual) {
+    vtab_set_error(&p->base,
+                   VEC_INTERAL_ERROR
+                   "rowids blob size mismatch on %s.%s.%lld. Expected %lld, actual %lld",
+                   p->schemaName, p->shadowChunksName, chunk_id, expected,
+                   actual);
+    sqlite3_blob_close(blobChunksRowids);
+    return SQLITE_ERROR;
+  }
+
+  i64 zero = 0;
+  rc = sqlite3_blob_write(blobChunksRowids, &zero, sizeof(i64),
+                          chunk_offset * sizeof(i64));
+  int brc = sqlite3_blob_close(blobChunksRowids);
+  if (rc != SQLITE_OK) {
+    vtab_set_error(&p->base, "could not write rowids blob on %s.%s.%lld",
+                   p->schemaName, p->shadowChunksName, chunk_id);
+    return rc;
+  }
+  if (brc != SQLITE_OK) {
+    vtab_set_error(&p->base, "could not close rowids blob on %s.%s.%lld",
+                   p->schemaName, p->shadowChunksName, chunk_id);
+    return brc;
+  }
+  return SQLITE_OK;
+}
+
+// Zero out the vector bytes for each vector column at the given chunk/offset.
+int vec0Update_Delete_ClearVectors(vec0_vtab *p, i64 chunk_id,
+                                   i64 chunk_offset) {
+  for (int i = 0; i < p->numVectorColumns; i++) {
+    int rc;
+    sqlite3_blob *blobVectors = NULL;
+
+    rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowVectorChunksNames[i],
+                           "vectors", chunk_id, 1, &blobVectors);
+    if (rc != SQLITE_OK) {
+      vtab_set_error(&p->base, "Could not open vectors blob for %s.%s.%lld",
+                     p->schemaName, p->shadowVectorChunksNames[i], chunk_id);
+      return rc;
+    }
+
+    i64 expected =
+        p->chunk_size * vector_column_byte_size(p->vector_columns[i]);
+    i64 actual = sqlite3_blob_bytes(blobVectors);
+    if (expected != actual) {
+      vtab_set_error(&p->base,
+                     VEC_INTERAL_ERROR
+                     "vector blob size mismatch on %s.%s.%lld. Expected %lld, actual %lld",
Expected %lld, actual %lld", + p->schemaName, p->shadowVectorChunksNames[i], chunk_id, expected, actual); + sqlite3_blob_close(blobVectors); + return SQLITE_ERROR; + } + + size_t nbytes = vector_column_byte_size(p->vector_columns[i]); + void *zeros = sqlite3_malloc(nbytes); + if (!zeros) { + sqlite3_blob_close(blobVectors); + return SQLITE_NOMEM; + } + memset(zeros, 0, nbytes); + rc = vec0_write_vector_to_vector_blob(blobVectors, chunk_offset, zeros, + p->vector_columns[i].dimensions, + p->vector_columns[i].element_type); + sqlite3_free(zeros); + + int brc = sqlite3_blob_close(blobVectors); + if (rc != SQLITE_OK) { + vtab_set_error(&p->base, "Could not write to vectors blob for %s.%s.%lld", + p->schemaName, p->shadowVectorChunksNames[i], chunk_id); + return rc; + } + if (brc != SQLITE_OK) { + vtab_set_error(&p->base, + "Could not commit blob transaction for vectors blob for %s.%s.%lld", + p->schemaName, p->shadowVectorChunksNames[i], chunk_id); + return brc; + } + } + return SQLITE_OK; +} + int vec0Update_Delete_DeleteAux(vec0_vtab *p, i64 rowid) { int rc; sqlite3_stmt *stmt = NULL; @@ -8574,9 +8669,17 @@ int vec0Update_Delete(sqlite3_vtab *pVTab, sqlite3_value *idValue) { // 3. zero out rowid in chunks.rowids // https://github.com/asg017/sqlite-vec/issues/54 + rc = vec0Update_Delete_ClearRowid(p, chunk_id, chunk_offset); + if (rc != SQLITE_OK) { + return rc; + } // 4. zero out any data in vector chunks tables // https://github.com/asg017/sqlite-vec/issues/54 + rc = vec0Update_Delete_ClearVectors(p, chunk_id, chunk_offset); + if (rc != SQLITE_OK) { + return rc; + } // 5. delete from _rowids table rc = vec0Update_Delete_DeleteRowids(p, rowid); diff --git a/tests/__snapshots__/test-auxiliary.ambr b/tests/__snapshots__/test-auxiliary.ambr index bfe3d2c9..66a3ef3a 100644 --- a/tests/__snapshots__/test-auxiliary.ambr +++ b/tests/__snapshots__/test-auxiliary.ambr @@ -137,7 +137,7 @@ 'chunk_id': 1, 'size': 8, 'validity': b'\x06', - 'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'rowids': b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), @@ -163,7 +163,7 @@ 'rows': list([ OrderedDict({ 'rowid': 1, - 'vectors': b'\x00\x00\x80?\x00\x00\x00@\x00\x00@@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'vectors': b'\x00\x00\x00\x00\x00\x00\x00@\x00\x00@@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), diff --git a/tests/__snapshots__/test-general.ambr b/tests/__snapshots__/test-general.ambr index 0eac460f..ddae7aa0 100644 --- a/tests/__snapshots__/test-general.ambr +++ b/tests/__snapshots__/test-general.ambr @@ -126,7 +126,7 @@ 'rows': list([ OrderedDict({ 'schema': 'main', - 'name': 'v_auxiliary', + 'name': 'v_metadatatext00', 'type': 'shadow', 'ncol': 2, 'wr': 0, @@ -134,39 +134,39 @@ }), OrderedDict({ 'schema': 'main', - 'name': 'v_chunks', + 'name': 'v_metadatachunks00', 'type': 'shadow', - 'ncol': 6, + 'ncol': 2, 'wr': 0, 'strict': 0, }), OrderedDict({ 'schema': 'main', - 'name': 'v_info', + 'name': 'v_rowids', 'type': 'shadow', - 'ncol': 2, + 'ncol': 4, 'wr': 0, 'strict': 0, 
     }),
     OrderedDict({
       'schema': 'main',
-      'name': 'v_rowids',
+      'name': 'v_auxiliary',
       'type': 'shadow',
-      'ncol': 4,
+      'ncol': 2,
       'wr': 0,
       'strict': 0,
     }),
     OrderedDict({
       'schema': 'main',
-      'name': 'v_metadatachunks00',
+      'name': 'v_chunks',
       'type': 'shadow',
-      'ncol': 2,
+      'ncol': 6,
       'wr': 0,
       'strict': 0,
     }),
     OrderedDict({
       'schema': 'main',
-      'name': 'v_metadatatext00',
+      'name': 'v_info',
       'type': 'shadow',
       'ncol': 2,
       'wr': 0,
diff --git a/tests/__snapshots__/test-metadata.ambr b/tests/__snapshots__/test-metadata.ambr
index 12212ff0..e5ffaf28 100644
--- a/tests/__snapshots__/test-metadata.ambr
+++ b/tests/__snapshots__/test-metadata.ambr
@@ -28,7 +28,7 @@
       'chunk_id': 1,
       'size': 8,
       'validity': b'\x02',
-      'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
+      'rowids': b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
     }),
   ]),
 }),
@@ -89,7 +89,7 @@
   'rows': list([
     OrderedDict({
       'rowid': 1,
-      'vectors': b'\x11\x11\x11\x11""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
+      'vectors': b'\x00\x00\x00\x00""""\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
     }),
   ]),
 }),
@@ -264,7 +264,7 @@
       'chunk_id': 1,
       'size': 8,
       'validity': b'\x06',
-      'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
+      'rowids': b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
     }),
   ]),
 }),
@@ -335,7 +335,7 @@
   'rows': list([
     OrderedDict({
      'rowid': 1,
-      'vectors': b'\x11\x11\x11\x11""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
+      'vectors': b'\x00\x00\x00\x00""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
     }),
   ]),
 }),
diff --git a/tests/test-delete-clears-bytes.py b/tests/test-delete-clears-bytes.py
new file mode 100644
index 00000000..41ff37f9
--- /dev/null
+++ b/tests/test-delete-clears-bytes.py
@@ -0,0 +1,106 @@
+import os
+
+
+def test_delete_clears_rowid_and_vectors():
+    try:
+        import pysqlite3 as sqlite3  # bundles a modern SQLite with extension loading
+    except ImportError:  # fall back to the stdlib module if pysqlite3 is unavailable
+        import sqlite3
+
+    db = sqlite3.connect(":memory:")
+    db.row_factory = sqlite3.Row
+    if hasattr(db, "enable_load_extension"):
+        db.enable_load_extension(True)
+    ext = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "dist", "vec0"))
+    try:
+        # Name the entrypoint explicitly rather than relying on the default
+        db.load_extension(ext, entrypoint="sqlite3_vec_init")
+    except TypeError:
+        # Older sqlite3 bindings lack the entrypoint parameter; SQLite then
+        # derives "sqlite3_vec_init" from the "vec0" filename on its own
+        db.load_extension(ext)
+
+    # One vector column with 1 dimension (4 bytes per vector), chunk_size=8
+    db.execute("create virtual table v using vec0(vector float[1], chunk_size=8)")
chunk_size=8)") + + # Insert two rows with distinct raw vector bytes + db.execute( + "insert into v(rowid, vector) values (?, ?)", + [1, b"\x11\x11\x11\x11"], + ) + db.execute( + "insert into v(rowid, vector) values (?, ?)", + [2, b"\x22\x22\x22\x22"], + ) + + # Sanity check pre-delete: validity has first two bits set (0b00000011) + row = db.execute("select validity, rowids from v_chunks").fetchone() + assert row is not None + assert row[0] == b"\x03" + + # Delete rowid=1 + db.execute("delete from v where rowid = 1") + + # After delete, validity should only have bit 1 set (0b00000010) + row = db.execute("select validity, rowids from v_chunks").fetchone() + assert row[0] == b"\x02" + + # Rowids BLOB: first 8 bytes (slot 0) must be zero; second (slot 1) must be rowid=2 + rowids = row[1] + assert isinstance(rowids, (bytes, bytearray)) + assert len(rowids) == 8 * 8 # chunk_size * sizeof(i64) + assert rowids[0:8] == b"\x00" * 8 + assert rowids[8:16] == b"\x02\x00\x00\x00\x00\x00\x00\x00" + + # Vectors BLOB for the first (and only) vector column + vectors_row = db.execute("select vectors from v_vector_chunks00").fetchone() + vectors = vectors_row[0] + # chunk_size (8) * 4 bytes per float32 = 32 bytes + assert len(vectors) == 32 + # Slot 0 cleared to zeros, slot 1 left as inserted (0x22 0x22 0x22 0x22) + assert vectors[0:4] == b"\x00\x00\x00\x00" + assert vectors[4:8] == b"\x22\x22\x22\x22" + + +def test_vacuum_shrinks_file(tmp_path): + try: + import pysqlite3 as sqlite3 + except ImportError: + import sqlite3 + + db_path = tmp_path / "vacuum_vec.db" + + con = sqlite3.connect(str(db_path)) + con.row_factory = sqlite3.Row + if hasattr(con, "enable_load_extension"): + con.enable_load_extension(True) + ext = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "dist", "vec0")) + try: + con.load_extension(ext) + except Exception: + # Some platforms require the full filename or default entrypoint; fallback already tried + con.load_extension(ext) + + # Use a larger chunk_size to inflate file size more clearly + con.execute("create virtual table v using vec0(vector float[1], chunk_size=4096)") + + # Insert a decent number of rows to grow the DB + N = 10000 + con.executemany( + "insert into v(rowid, vector) values(?, ?)", + ((i, b"\x11\x11\x11\x11") for i in range(1, N + 1)), + ) + con.commit() + + size_after_insert = os.stat(db_path).st_size + assert size_after_insert > 0 + + # Drop the table to free its pages, then VACUUM to rewrite/shrink the file + con.execute("drop table v") + con.commit() + con.execute("VACUUM") + con.close() + + size_after_vacuum = os.stat(db_path).st_size + + # File should shrink after dropping the table and VACUUM + assert size_after_vacuum < size_after_insert