diff --git a/README.md b/README.md index 4850822..845a305 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,8 @@ A Dissect module implementing parsers for various database formats, including: - Berkeley DB, used for example in older RPM databases -- Microsofts Extensible Storage Engine (ESE), used for example in Active Directory, Exchange and Windows Update +- Microsoft's Extensible Storage Engine (ESE), used for example in Active Directory, Exchange and Windows Update +- Google's LevelDB, used by browsers to store LocalStorage, SessionStorage and serialized IndexedDB databases - SQLite3, commonly used by applications to store configuration data For more information, please see [the documentation](https://docs.dissect.tools/en/latest/projects/dissect.database/index.html). diff --git a/dissect/database/__init__.py b/dissect/database/__init__.py index a97bc2f..a1d2174 100644 --- a/dissect/database/__init__.py +++ b/dissect/database/__init__.py @@ -1,13 +1,21 @@ from __future__ import annotations from dissect.database.bsd.db import DB +from dissect.database.chromium.localstorage.localstorage import LocalStorage +from dissect.database.chromium.sessionstorage.sessionstorage import SessionStorage from dissect.database.ese.ese import ESE from dissect.database.exception import Error +from dissect.database.indexeddb.indexeddb import IndexedDB +from dissect.database.leveldb.leveldb import LevelDB from dissect.database.sqlite3.sqlite3 import SQLite3 __all__ = [ "DB", "ESE", "Error", + "IndexedDB", + "LevelDB", + "LocalStorage", "SQLite3", + "SessionStorage", ] diff --git a/dissect/database/chromium/__init__.py b/dissect/database/chromium/__init__.py new file mode 100644 index 0000000..3e47785 --- /dev/null +++ b/dissect/database/chromium/__init__.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +from dissect.database.chromium.localstorage.localstorage import LocalStorage +from dissect.database.chromium.sessionstorage.sessionstorage import SessionStorage + +__all__ = [ + "LocalStorage", + "SessionStorage", +] diff --git a/dissect/database/chromium/localstorage/__init__.py b/dissect/database/chromium/localstorage/__init__.py new file mode 100644 index 0000000..57a08f1 --- /dev/null +++ b/dissect/database/chromium/localstorage/__init__.py @@ -0,0 +1,12 @@ +from __future__ import annotations + +from dissect.database.chromium.localstorage.c_localstorage import c_localstorage +from dissect.database.chromium.localstorage.localstorage import Key, LocalStorage, MetaKey, Store + +__all__ = [ + "Key", + "LocalStorage", + "MetaKey", + "Store", + "c_localstorage", +] diff --git a/dissect/database/chromium/localstorage/c_localstorage.py b/dissect/database/chromium/localstorage/c_localstorage.py new file mode 100644 index 0000000..4b7051b --- /dev/null +++ b/dissect/database/chromium/localstorage/c_localstorage.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from dissect.cstruct import cstruct + +from dissect.database.util.protobuf import ProtobufVarint, ProtobufVarint32 + +# References: +# - https://github.com/chromium/chromium/blob/main/components/services/storage/dom_storage/local_storage_database.proto +localstorage_def = """ +struct LocalStorageAreaWriteMetaData { + uint8 lm_type; + varint last_modified; + + uint8 sb_type; + varint size_bytes; +}; + +struct LocalStorageAreaAccessMetaData { + uint8 la_type; + varint last_accessed; +}; +""" + +c_localstorage = cstruct() +c_localstorage.add_custom_type("varint", ProtobufVarint, size=None, alignment=1, signed=False) 
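+# A protobuf varint stores an integer 7 bits per byte, least significant group first, with +# the high bit set on every byte except the last; e.g. 300 encodes to b"\xac\x02".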
+c_localstorage.add_custom_type("varint64", ProtobufVarint, size=None, alignment=1, signed=False) +c_localstorage.add_custom_type("varint32", ProtobufVarint32, size=None, alignment=1, signed=False) +c_localstorage.load(localstorage_def) diff --git a/dissect/database/chromium/localstorage/localstorage.py b/dissect/database/chromium/localstorage/localstorage.py new file mode 100644 index 0000000..b3d5b45 --- /dev/null +++ b/dissect/database/chromium/localstorage/localstorage.py @@ -0,0 +1,235 @@ +from __future__ import annotations + +from functools import cached_property +from typing import TYPE_CHECKING + +from dissect.util.ts import webkittimestamp + +from dissect.database.chromium.localstorage import c_localstorage +from dissect.database.leveldb.c_leveldb import c_leveldb +from dissect.database.leveldb.leveldb import LevelDB + +if TYPE_CHECKING: + from collections.abc import Iterator + from pathlib import Path + + +class LocalStorage: + """Google LocalStorage implementation. + + References: + - https://www.cclsolutionsgroup.com/post/chromium-session-storage-and-local-storage + """ + + stores: list[Store] + + def __init__(self, path: Path): + if not path.exists(): + raise FileNotFoundError(f"Provided path does not exist: {path!r}") + + if not path.is_dir(): + raise NotADirectoryError(f"Provided path is not a directory: {path!r}") + + self._leveldb = LevelDB(path) + + self.path = path + self.stores = list(self._get_stores()) + + def __repr__(self) -> str: + return f"" + + def _get_stores(self) -> Iterator[Store]: + """Iterate over LevelDB records for store meta information.""" + + meta_keys = {} + + for record in self._leveldb.records: + if record.state == c_leveldb.RecordState.LIVE and ( + record.key[0:5] == b"META:" or record.key[0:11] == b"METAACCESS:" + ): + cls = MetaKey if record.key[0:5] == b"META:" else MetaAccessKey + meta_key = cls(record.key, record.value, record.state, record.sequence) + meta_keys.setdefault(meta_key.key, []) + meta_keys[meta_key.key].append(meta_key) + + for meta in meta_keys.values(): + yield Store(self, meta) + + def store(self, key: str) -> Store | None: + """Get a single store by host name.""" + for store in self.stores: + if store.host == key: + return store + return None + + +class Store: + """Represents a single store of keys.""" + + host: str + records: list[Key] + meta: list[MetaKey] + + def __init__(self, local_storage: LocalStorage, meta: list[MetaKey]): + self._local_storage = local_storage + self._records: list[Key] = [] + + self.host = meta[0].key + self.meta = sorted(meta, key=lambda m: m.sequence) + + def __repr__(self) -> str: + return f"" + + @property + def records(self) -> Iterator[RecordKey]: + """Yield all records related to this store.""" + + if self._records: + yield from self._records + + # e.g. 
with "_https://google.com\x00\x01MyKey", the prefix would be "_https://google.com\x00" + prefix = RecordKey.prefix + self.host.encode("iso-8859-1") + b"\x00" + prefix_len = len(prefix) + + for record in self._local_storage._leveldb.records: + if record.key[:prefix_len] == prefix: + key = RecordKey(self, record.key, record.value, record.state, record.sequence) + self._records.append(key) + yield key + + def get(self, key: str) -> RecordKey | None: + """Get a single :class:`RecordKey` by the given string identifier.""" + for record in self.records: + if record.key == key: + return record + return None + + +class Key: + """Abstract LocalStorage key class.""" + + prefix: bytes + state: c_leveldb.RecordState + sequence: int + key: str + value: str + + def __init__(self, raw_key: bytes, raw_value: bytes, state: c_leveldb.RecordState, sequence: int): + self._raw_key = raw_key + self._raw_value = raw_value + + self.state = state + self.sequence = sequence + + if not raw_key.startswith(self.prefix): + raise ValueError( + f"Invalid key prefix {raw_key[: len(self.prefix)]!r} for {self.__class__.__name__}: expected {self.prefix!r}" # noqa: E501 + ) + + self._decode_key() + self._decode_value() + + def __repr__(self): + return f"<{self.__class__.__name__} state={self.state!r} sequence={self.sequence!r} key={self.key!r} value={self.value!r}>" # noqa: E501 + + def _decode_key(self) -> None: + raise NotImplementedError + + def _decode_value(self) -> None: + raise NotImplementedError + + +class MetaKey(Key): + """Represents a LocalStorage meta key.""" + + prefix: bytes = b"META:" + value: c_localstorage.LocalStorageAreaWriteMetaData + + def _decode_key(self) -> None: + self.key = self._raw_key.removeprefix(self.prefix).decode("iso-8859-1") + + def _decode_value(self) -> None: + self.value = c_localstorage.LocalStorageAreaWriteMetaData(self._raw_value) + + +class MetaAccessKey(MetaKey): + """Represents a LocalStorage meta access key. + + References: + - https://chromium-review.googlesource.com/c/chromium/src/+/5585301 + """ + + prefix: bytes = b"METAACCESS:" + value: c_localstorage.LocalStorageAreaAccessMetaData + + def _decode_value(self) -> None: + self.value = c_localstorage.LocalStorageAreaAccessMetaData(self._raw_value) + + +class RecordKey(Key): + """Represents a LocalStorage record key.""" + + prefix: bytes = b"_" + + def __init__(self, store: Store, raw_key: bytes, raw_value: bytes, state: c_leveldb.RecordState, sequence: int): + super().__init__(raw_key, raw_value, state, sequence) + self.store = store + + @cached_property + def meta(self) -> dict: + """Calculate the metadata that likely belongs to this key. + + In a batch write action, meta keys are written first, followed by the records belonging to that batch. + To identify a candidate meta key for this record key, we iterate over the meta keys for the store that + this record key belongs to and choose the meta key(s) with the closest sequence number that is lower than + the record key sequence number. This introduces a possible inaccuracy for the matched timestamp(s). + + The accuracy of these timestamps should be taken with a grain of salt when interpreting them. A latency of + 5 to 60 seconds was observed between a script requesting a write and the key data ending up on disk. The + latency depends on several factors, such as how many write actions are happening at the time of write and + the amount of writes per host (website) happening (this is limited to 60 per hour). + + The reader (you!) 
is invited to invent a smarter method to efficiently find metadata belonging to a record key. + + References: + - local_storage_impl.cc + """ + meta = {"created": None, "last_modified": None, "last_accessed": None} + + for meta_key in self.store.meta: + if meta_key.sequence < self.sequence: + if hasattr(meta_key.value, "last_modified"): + meta["last_modified"] = webkittimestamp(meta_key.value.last_modified) + if not meta["created"]: + meta["created"] = meta["last_modified"] + if hasattr(meta_key.value, "last_accessed"): + meta["last_accessed"] = webkittimestamp(meta_key.value.last_accessed) + if not meta["created"] or meta["created"] > meta["last_accessed"]: + meta["created"] = meta["last_accessed"] + + elif meta_key.sequence > self.sequence: + break + + return meta + + def _decode_key(self) -> None: + _, _, buf = self._raw_key.removeprefix(self.prefix).partition(b"\x00") + + if buf[0] == 0x00: + self.key = buf[1:].decode("utf-16-le") + + if buf[0] == 0x01: + self.key = buf[1:].decode("iso-8859-1") + + def _decode_value(self) -> None: + buf = self._raw_value + + if not buf: + self.value = None + return + + if buf[0] == 0x00: + self.value = buf[1:].decode("utf-16-le") + + if buf[0] == 0x01: + self.value = buf[1:].decode("iso-8859-1") diff --git a/dissect/database/chromium/sessionstorage/__init__.py b/dissect/database/chromium/sessionstorage/__init__.py new file mode 100644 index 0000000..027f521 --- /dev/null +++ b/dissect/database/chromium/sessionstorage/__init__.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +from dissect.database.chromium.sessionstorage.sessionstorage import Namespace, Record, SessionStorage + +__all__ = [ + "Namespace", + "Record", + "SessionStorage", +] diff --git a/dissect/database/chromium/sessionstorage/sessionstorage.py b/dissect/database/chromium/sessionstorage/sessionstorage.py new file mode 100644 index 0000000..4026771 --- /dev/null +++ b/dissect/database/chromium/sessionstorage/sessionstorage.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from dissect.database.leveldb.leveldb import LevelDB +from dissect.database.leveldb.leveldb import Record as LevelDBRecord + +if TYPE_CHECKING: + from collections.abc import Iterator + from pathlib import Path + + +class SessionStorage: + """Google SessionStorage implementation. 
+ + References: + - https://www.cclsolutionsgroup.com/post/chromium-session-storage-and-local-storage + """ + + namespaces: list[Namespace] + + def __init__(self, path: Path): + self.path = path + self._leveldb = LevelDB(path) + self.namespaces = list(self._get_namespaces()) + + def __repr__(self): + return f"<SessionStorage path={self.path}>" + + def _get_namespaces(self) -> Iterator[Namespace]: + for record in self._leveldb.records: + if record.key[0:10] == b"namespace-" and len(record.key) > 10 and record.value: + yield Namespace(self, record) + + def namespace(self, key: int | str) -> Iterator[Namespace]: + """Yield namespaces by the given id or hostname.""" + for namespace in self.namespaces: + if namespace.id == key or namespace.host == key: + yield namespace + + +class Namespace: + """Represents a single Session Storage namespace.""" + + uuid: str + id: int + host: str + + def __init__(self, session_storage: SessionStorage, record: LevelDBRecord): + self._session_storage = session_storage + self._record = record + + if not record.value: + raise ValueError(f"Namespace record does not have a value: {record!r}") + + _, self.uuid, self.host = record.key.decode().split("-", 2) + + self.id = int(record.value.decode()) + + def __repr__(self): + return f"<Namespace id={self.id} uuid={self.uuid} host={self.host}>" + + @property + def records(self) -> Iterator[Record]: + prefix = b"map-" + str(self.id).encode() + b"-" + for record in self._session_storage._leveldb.records: + if record.key[0 : len(prefix)] == prefix: + yield Record(self, record, prefix) + + +class Record: + """Represents a single Session Storage key and value pair.""" + + namespace: Namespace + + key: str + value: str + + def __init__(self, namespace: Namespace, record: LevelDBRecord, prefix: bytes): + self._namespace = namespace + self._record = record + + self.key = record.key.removeprefix(prefix).decode() + self.value = record.value.decode("utf-16-le") + + def __repr__(self): + return f"<Record key={self.key} value={self.value}>" diff --git a/dissect/database/indexeddb/__init__.py b/dissect/database/indexeddb/__init__.py new file mode 100644 index 0000000..4faaf9a --- /dev/null +++ b/dissect/database/indexeddb/__init__.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +from dissect.database.indexeddb.c_indexeddb import c_indexeddb +from dissect.database.indexeddb.indexeddb import IndexedDB + +__all__ = [ + "IndexedDB", + "c_indexeddb", +] diff --git a/dissect/database/indexeddb/c_indexeddb.py b/dissect/database/indexeddb/c_indexeddb.py new file mode 100644 index 0000000..4798452 --- /dev/null +++ b/dissect/database/indexeddb/c_indexeddb.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +from dissect.cstruct import cstruct + +from dissect.database.util.protobuf import ProtobufVarint, ProtobufVarint32 + +indexeddb_def = """ +enum KeyPrefixType : uint8 { + GLOBAL_METADATA = 0, + DATABASE_METADATA = 1, + OBJECT_STORE_DATA = 2, + EXISTS_ENTRY = 3, + INDEX_DATA = 4, + INVALID_TYPE = 5, + BLOB_ENTRY = 6, +}; + +enum GlobalMetaDataType : uint8 { + SchemaVersionKey = 0, + MaxDatabaseIdKey = 1, + DataVersionKey = 2, + RecoveryBlobJournalKey = 3, + ActiveBlobJournalKey = 4, + EarliestSweepKey = 5, + EarliestCompactionKey = 6, + DatabaseFreeListKey = 100, + DatabaseNameKey = 201, +}; + +enum DatabaseMetaDataType { + ORIGIN_NAME = 0, + DATABASE_NAME = 1, + USER_STRING_VERSION = 2, // Obsolete + MAX_OBJECT_STORE_ID = 3, + USER_VERSION = 4, + BLOB_KEY_GENERATOR_CURRENT_NUMBER = 5, + MAX_SIMPLE_METADATA_TYPE = 6, + + ObjectStoreMetaData = 50, +}; + +enum IndexIdType { + ObjectStoreData = 1, + ExistsEntry = 2, + BlobEntry = 3, +}; + +#define
kMaximumDepth 2000 +#define kMaximumArraySize 1000000 + +enum IdbKeyType { + Null = 0, + String = 1, + Date = 2, + Number = 3, + Array = 4, + MinKey = 5, + Binary = 6, +}; + +struct KeyPrefix { + uint8 lengths; + char database_id[((lengths >> 5) & 0x07) + 1]; + char object_store_id[((lengths >> 2) & 0x07) + 1]; + char index_id[(lengths & 0x03) + 1]; +}; + +#define kMinWireFormatVersion 21 + +struct IdbValueHeader { + varint version; + uint8 blink_tag; // 0xff + varint blink_version; +}; + +struct IdbValueBlob { + varint size; + varint index; +}; +""" + +c_indexeddb = cstruct() +c_indexeddb.add_custom_type("varint", ProtobufVarint, size=None, alignment=1, signed=False) +c_indexeddb.add_custom_type("varint64", ProtobufVarint, size=None, alignment=1, signed=False) +c_indexeddb.add_custom_type("varint32", ProtobufVarint32, size=None, alignment=1, signed=False) +c_indexeddb.load(indexeddb_def) diff --git a/dissect/database/indexeddb/c_indexeddb.pyi b/dissect/database/indexeddb/c_indexeddb.pyi new file mode 100644 index 0000000..81d1c61 --- /dev/null +++ b/dissect/database/indexeddb/c_indexeddb.pyi @@ -0,0 +1,97 @@ +# Generated by cstruct-stubgen +from typing import BinaryIO, Literal, TypeAlias, overload + +import dissect.cstruct as __cs__ + +class _c_indexeddb(__cs__.cstruct): + kMaximumDepth: Literal[2000] = ... + kMaximumArraySize: Literal[1000000] = ... + kMinWireFormatVersion: Literal[21] = ... + class varint(__cs__.ProtobufVarint): ... + class varint64(__cs__.ProtobufVarint): ... + class varint32(__cs__.ProtobufVarint32): ... + + class KeyPrefixType(__cs__.Enum): + GLOBAL_METADATA = ... + DATABASE_METADATA = ... + OBJECT_STORE_DATA = ... + EXISTS_ENTRY = ... + INDEX_DATA = ... + INVALID_TYPE = ... + BLOB_ENTRY = ... + + class GlobalMetaDataType(__cs__.Enum): + SchemaVersionKey = ... + MaxDatabaseIdKey = ... + DataVersionKey = ... + RecoveryBlobJournalKey = ... + ActiveBlobJournalKey = ... + EarliestSweepKey = ... + EarliestCompactionKey = ... + DatabaseFreeListKey = ... + DatabaseNameKey = ... + + class DatabaseMetaDataType(__cs__.Enum): + ORIGIN_NAME = ... + DATABASE_NAME = ... + USER_STRING_VERSION = ... + MAX_OBJECT_STORE_ID = ... + USER_VERSION = ... + BLOB_KEY_GENERATOR_CURRENT_NUMBER = ... + MAX_SIMPLE_METADATA_TYPE = ... + ObjectStoreMetaData = ... + + class IndexIdType(__cs__.Enum): + ObjectStoreData = ... + ExistsEntry = ... + BlobEntry = ... + + class IdbKeyType(__cs__.Enum): + Null = ... + String = ... + Date = ... + Number = ... + Array = ... + MinKey = ... + Binary = ... + + class KeyPrefix(__cs__.Structure): + lengths: _c_indexeddb.uint8 + database_id: __cs__.CharArray + object_store_id: __cs__.CharArray + index_id: __cs__.CharArray + @overload + def __init__( + self, + lengths: _c_indexeddb.uint8 | None = ..., + database_id: __cs__.CharArray | None = ..., + object_store_id: __cs__.CharArray | None = ..., + index_id: __cs__.CharArray | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class IdbValueHeader(__cs__.Structure): + version: _c_indexeddb.varint + blink_tag: _c_indexeddb.uint8 + blink_version: _c_indexeddb.varint + @overload + def __init__( + self, + version: _c_indexeddb.varint | None = ..., + blink_tag: _c_indexeddb.uint8 | None = ..., + blink_version: _c_indexeddb.varint | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... 
+ + class IdbValueBlob(__cs__.Structure): + size: _c_indexeddb.varint + index: _c_indexeddb.varint + @overload + def __init__(self, size: _c_indexeddb.varint | None = ..., index: _c_indexeddb.varint | None = ...): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + +# Technically `c_indexeddb` is an instance of `_c_indexeddb`, but then we can't use it in type hints +c_indexeddb: TypeAlias = _c_indexeddb diff --git a/dissect/database/indexeddb/indexeddb.py b/dissect/database/indexeddb/indexeddb.py new file mode 100644 index 0000000..f83e08e --- /dev/null +++ b/dissect/database/indexeddb/indexeddb.py @@ -0,0 +1,350 @@ +from __future__ import annotations + +from datetime import datetime, timedelta, timezone +from io import BytesIO +from typing import TYPE_CHECKING, Any, BinaryIO + +from dissect.database.indexeddb.c_indexeddb import c_indexeddb +from dissect.database.leveldb.c_leveldb import c_leveldb +from dissect.database.leveldb.leveldb import LevelDB +from dissect.database.leveldb.leveldb import Record as LevelDBRecord +from dissect.database.util.blink import deserialize_blink_host_object +from dissect.database.util.protobuf import decode_varint + +if TYPE_CHECKING: + from collections.abc import Iterator + from pathlib import Path + +try: + import v8serialize + + HAS_V8 = True + +except ImportError: + HAS_V8 = False + + +class IndexedDB: + """Google IndexedDB implementation. + + References: + - https://github.com/chromium/chromium/blob/master/content/browser/indexed_db/docs/leveldb_coding_scheme.md + - https://github.com/chromium/chromium/blob/master/content/browser/indexed_db/docs/README.md + - https://github.com/google/dfindexeddb + - https://www.cclsolutionsgroup.com/post/indexeddb-on-chromium + """
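+ + # Usage sketch (path and names are illustrative, taken from the tests added below): + # + # idb = IndexedDB(Path("file__0.indexeddb.leveldb")) + # store = idb.database("ExampleDatabase").object_store("MyObjectStore") + # for record in store.records: + # print(record.key, record.value) # decoded key and V8-deserialized value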
+ + databases: list[Database] + + def __init__(self, path: Path): + self.path = path + self.databases = [] + + if not HAS_V8: + raise ImportError( + "Missing required dependency 'v8serialize', install with 'pip install dissect.database[indexeddb]'" + ) + + if not path.exists(): + raise FileNotFoundError(f"Provided path does not exist: {path!r}") + + if not path.is_dir(): + raise NotADirectoryError(f"Provided path is not a directory: {path!r}") + + self._leveldb = LevelDB(path) + self._records = list(self._get_records()) + self._metadata = self._get_metadata() + + # TODO: Check for schema version, we only support up to version 5. + + def __repr__(self) -> str: + return f"<IndexedDB path={self.path}>" + + def _get_records(self) -> Iterator[IndexedDBRecord]: + for record in self._leveldb.records: + yield IndexedDBRecord(record) + + def _get_metadata(self) -> dict[bytes, c_leveldb.Record]: + """Returns a dictionary of the current global metadata of this IndexedDB. Populates ``self.databases`` found along the way.""" + + metadata = {} + + for record in reversed(self._records): + if ( + record.database_id == 0 + and record.object_store_id == 0 + and record.index_id == 0 + and record.state == c_leveldb.RecordState.LIVE + and (record.key not in metadata or metadata[record.key].sequence < record.sequence) + ): + metadata[record.key] = record + + if record.key[0] == c_indexeddb.GlobalMetaDataType.DatabaseNameKey: + buf = BytesIO(record.key[1:]) + origin = read_varint_value(buf) + name = read_varint_value(buf) + id = read_truncated_int(record.value) + self.databases.append(Database(self, origin, name, id)) + + return metadata + + def database(self, key: int | str) -> Database | None: + """Get a database by id or name, returns on first match.""" + + for database in self.databases: + if (isinstance(key, int) and database.id == key) or (isinstance(key, str) and database.name == key): + return database + return None + + +class Database: + """Represents a single IndexedDB Database.""" + + def __init__(self, indexeddb: IndexedDB, origin: str, name: str, id: int): + self._indexeddb = indexeddb + + self.origin = origin + self.name = name + self.id = id + + self._metadata, self._object_store_metadata = self._get_metadata() + self.object_stores = list(self._get_object_stores()) + + def __repr__(self) -> str: + return f"<Database id={self.id} name={self.name} origin={self.origin} object_stores={len(self.object_stores)}>" # noqa: E501 + + def _get_metadata(self) -> tuple[dict, dict]: + """Return metadata dictionary of this database.""" + + metadata = {} + object_store_metadata = {} + + for record in reversed(self._indexeddb._records): + if ( + record.database_id == self.id + and record.object_store_id == 0 + and record.index_id == 0 + and record.state == c_leveldb.RecordState.LIVE + and (record.key not in metadata or metadata[record.key].sequence < record.sequence) + ): + metadata[record.key] = record + + if record.key[0] == c_indexeddb.DatabaseMetaDataType.MAX_OBJECT_STORE_ID: + self._maximum_object_store_id = read_truncated_int(record.value) + + elif record.key[0] == c_indexeddb.DatabaseMetaDataType.ObjectStoreMetaData: + buf = BytesIO(record.key[1:]) + object_store_id = decode_varint(buf, 10) + object_store_metadata.setdefault(object_store_id, {}) + metadata_type = buf.read(1) + object_store_metadata[object_store_id][metadata_type] = record + + return metadata, object_store_metadata + + def _get_object_stores(self) -> Iterator[ObjectStore]: + for object_store_id, object_store_metadata in self._object_store_metadata.items(): + yield ObjectStore(self, object_store_id, object_store_metadata) + + def object_store(self, key: int | str) -> ObjectStore | None: + """Return an object store based on the given key.""" + + for object_store in self.object_stores: + if (isinstance(key, int) and object_store.id == key) or (isinstance(key, str) and object_store.name == key): + return object_store + return None + + +class ObjectStore: + """Represents a single IndexedDB object store.""" + + id: int + name: str + + def __init__(self, database: Database, id: int, metadata: dict): + self.id = id + self._database = database + self._metadata = metadata
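+ + # Object store metadata entry 0x00 holds the store's name, encoded as UTF-16-BE.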
+ self.name = self._metadata.get(int.to_bytes(0)).value.decode("utf-16-be") + # TODO: Research if num of records is stored in metadata. + + self.records = list(self._get_records()) + + def __repr__(self) -> str: + return f"<ObjectStore id={self.id} name={self.name}>" + + def _get_records(self) -> Iterator[IndexedDBKey]: + """Yield stored records in the object store. Currently does not mark deleted records as such.""" + + for record in reversed(self._database._indexeddb._records): + if ( + record.database_id == self._database.id + and record.object_store_id == self.id + and record.index_id == c_indexeddb.IndexIdType.ObjectStoreData + and record.state == c_leveldb.RecordState.LIVE + ): + yield IndexedDBKey(self, record) + + def get(self, key: Any) -> IndexedDBKey | None: + """Return a single record based on the id or an arbitrary key value.""" + + for record in self.records: + if record.key == key: + return record + return None + + def keys(self) -> tuple: + """Return a tuple of record keys in this object store.""" + + return tuple(record.key for record in self.records) + + +class IndexedDBKey: + """Represents a single decoded IndexedDB key. + + References: + - https://chromium.googlesource.com/chromium/src/+/main/content/browser/indexed_db/indexed_db_leveldb_coding.cc + - https://chromium.googlesource.com/chromium/src/+/main/third_party/blink/public/common/indexeddb/indexeddb_key.h + - https://github.com/v8/v8/blob/master/src/objects/value-serializer.cc + - https://chromium.googlesource.com/chromium/src/third_party/+/master/blink/renderer/bindings/core/v8/serialization + """ + + def __init__(self, object_store: ObjectStore, record: IndexedDBRecord): + self.object_store = object_store + self._record = record + + self.type = None + self.key = None + self.value = None + + self.type, self.key, _ = self._decode_key(self._record.key) + self._decode_value() + + @classmethod + def _decode_key(cls, key_value: bytes) -> tuple[c_indexeddb.IdbKeyType, Any, int]: + """Decode the :class:`IndexedDBRecord` key value.""" + + key_buf = BytesIO(key_value) + type = c_indexeddb.IdbKeyType(key_buf.read(1)[0]) + offset = None + + if type == c_indexeddb.IdbKeyType.Null: + key = None + + elif type == c_indexeddb.IdbKeyType.String: + key = read_varint_value(key_buf)
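+ # e.g. b"\x01\x02\x00h\x00i" -> type String, varint code-unit count 2, UTF-16-BE data "hi"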
+ + elif type == c_indexeddb.IdbKeyType.Date: + ms = c_indexeddb.double(key_buf.read(8)) + key = datetime(1970, 1, 1, tzinfo=timezone.utc) + timedelta(milliseconds=ms) + + elif type == c_indexeddb.IdbKeyType.Number: + key = c_indexeddb.double(key_buf.read(8)) + + elif type == c_indexeddb.IdbKeyType.Array: + key = [] + size = decode_varint(key_buf, 10) + offset = key_buf.tell() + + for _ in range(size): + _, nkey, nsize = cls._decode_key(key_value[offset:]) + offset += nsize + key.append(nkey) + + elif type == c_indexeddb.IdbKeyType.MinKey: + key = None + + elif type == c_indexeddb.IdbKeyType.Binary: + size = decode_varint(key_buf, 10) + key = key_buf.read(size) + + else: + raise ValueError(f"Unknown IndexedDBKey type {type!r}") + + return type, key, offset if offset else key_buf.tell() + + def _decode_value(self) -> None: + """Decode the :class:`IndexedDBRecord` value using Blink and V8. + + Currently does not handle ``kReplaceWithBlob`` IDB value unwrapping. When deserializing fails, + the value of the key is set to the raw bytes instead. + + References: + - https://chromium.googlesource.com/chromium/src/+/refs/heads/main/third_party/blink/renderer/modules/indexeddb/idb_value_wrapping.cc + - https://chromium.googlesource.com/chromium/src/+/main/third_party/blink/renderer/bindings/core/v8/serialization/trailer_reader.h + - https://chromium.googlesource.com/chromium/src/+/main/third_party/blink/renderer/modules/indexeddb/idb_value_wrapping.cc + """ + + if not self._record.value: + return + + value_buf = BytesIO(self._record.value) + value_header = c_indexeddb.IdbValueHeader(value_buf) + + if value_header.blink_tag != 0xFF: + raise ValueError(f"Invalid Blink tag {value_header.blink_tag!r}") + + # Determine if a Blink trailer is present + if value_header.blink_version >= c_indexeddb.kMinWireFormatVersion: + self._value_blink_trailer = value_buf.read(13) + self._raw_object = value_buf.read() + + try: + self.value = v8serialize.loads( + data=self._raw_object, + jsmap_type=dict, + js_object_type=dict, + js_array_type=dict, + default_timezone=timezone.utc, + host_object_deserializer=deserialize_blink_host_object, + ) + except Exception: + self.value = self._raw_object + + def __repr__(self) -> str: + return f"<IndexedDBKey type={self.type} key={self.key}>" + + +class IndexedDBRecord: + """A single IndexedDB record constructed from a LevelDB record.""" + + def __init__(self, record: LevelDBRecord): + self._record = record + self._prefix = c_indexeddb.KeyPrefix(record.key) + + self.key = record.key[len(self._prefix.dumps()) :] + self.value = record.value + self.state = record.state + self.sequence = record.sequence + + self.database_id = int.from_bytes(self._prefix.database_id, "little") + self.object_store_id = int.from_bytes(self._prefix.object_store_id, "little") + self.index_id = int.from_bytes(self._prefix.index_id, "little") + + def __repr__(self) -> str: + return f"<IndexedDBRecord database_id={self.database_id} object_store_id={self.object_store_id} index_id={self.index_id} state={self.state} sequence={self.sequence}>" # noqa: E501 + + +def read_varint_value(buf: BinaryIO) -> str: + """Read a varint-length-prefixed UTF-16-BE string, such as a database name from a DatabaseNameKey buffer. + + References: + - https://github.com/chromium/chromium/blob/master/content/browser/indexed_db/docs/leveldb_coding_scheme.md + """ + + length = decode_varint(buf, 10) + return buf.read(length * 2).decode("utf-16-be") + + +def read_truncated_int(input: bytes) -> int: + """Read a truncated little-endian integer from the given byte(s). + + References: + - https://github.com/chromium/chromium/blob/master/content/browser/indexed_db/indexed_db_leveldb_coding.h#EncodeInt + """ + + result = 0 + for i, b in enumerate(input): + result |= b << (i * 8) + return result diff --git a/dissect/database/leveldb/__init__.py b/dissect/database/leveldb/__init__.py new file mode 100644 index 0000000..1b6e9d4 --- /dev/null +++ b/dissect/database/leveldb/__init__.py @@ -0,0 +1,12 @@ +from __future__ import annotations + +from dissect.database.leveldb.c_leveldb import c_leveldb +from dissect.database.leveldb.leveldb import LevelDB, LogBlock, LogFile, Record + +__all__ = [ + "LevelDB", + "LogBlock", + "LogFile", + "Record", + "c_leveldb", +] diff --git a/dissect/database/leveldb/c_leveldb.py b/dissect/database/leveldb/c_leveldb.py new file mode 100644 index 0000000..6a4e766 --- /dev/null +++ b/dissect/database/leveldb/c_leveldb.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +from dissect.cstruct import cstruct + +from dissect.database.util.protobuf import ProtobufVarint, ProtobufVarint32 + +leveldb_def = """ +/* + * LevelDB log file structures.
+ */ + +#define LOG_BLOCK_SIZE 0x8000 +#define LOG_ENTRY_HEADER_SIZE 7 + +enum LogBlockType : uint8 { + FULL = 1, + FIRST = 2, + MIDDLE = 3, + LAST = 4, +}; + +struct LogBlockHeader { + uint32 crc32c; + uint16 size; // size of the first record + LogBlockType type; +}; + +struct BatchHeader { + uint64 seq_num; + uint32 rec_count; +}; + +enum RecordState : uint8 { + DELETED = 0, + LIVE = 1, + UNKNOWN = 2, +}; + +// TODO: Rename to LogRecord +struct Record { + RecordState state; + varint key_len; + char key[key_len]; + // varint value_len; // if state != DELETED + // char value[value_len]; // if state != DELETED +}; + + +/* + * LevelDB .ldb file structures. + */ +#define LDB_MAGIC 0xdb4775248b80fb57 +#define LDB_FOOTER_SIZE 48 +#define LDB_BLOCK_TRAILER_SIZE 5 + +struct BlockHandle { + varint offset; // varint64 + varint length; // varint64 +}; + +struct BlockEntry { + varint32 shared_len; + varint32 non_shared_len; + varint32 value_len; + char key[0]; // shared key computed at runtime + char value[0]; // value_len read at runtime +}; + +enum CompressionType : uint8 { + NONE = 0, + SNAPPY = 1, +}; + +struct BlockTrailer { + CompressionType compression; + char crc32c[4]; +}; + +struct LdbFooter { + BlockHandle meta_index_handle; + BlockHandle index_handle; + // char padding[40-sizeof(meta_index_handle)-sizeof(index_handle)]; + // char magic[8]; +}; +""" + +c_leveldb = cstruct(endian="<") +c_leveldb.add_custom_type("varint", ProtobufVarint, size=None, alignment=1, signed=False) +c_leveldb.add_custom_type("varint64", ProtobufVarint, size=None, alignment=1, signed=False) +c_leveldb.add_custom_type("varint32", ProtobufVarint32, size=None, alignment=1, signed=False) +c_leveldb.load(leveldb_def) diff --git a/dissect/database/leveldb/c_leveldb.pyi b/dissect/database/leveldb/c_leveldb.pyi new file mode 100644 index 0000000..a77cc8b --- /dev/null +++ b/dissect/database/leveldb/c_leveldb.pyi @@ -0,0 +1,116 @@ +# Generated by cstruct-stubgen +from typing import BinaryIO, Literal, TypeAlias, overload + +import dissect.cstruct as __cs__ + +class _c_leveldb(__cs__.cstruct): + LOG_BLOCK_SIZE: Literal[32768] = ... + LOG_ENTRY_HEADER_SIZE: Literal[7] = ... + LDB_MAGIC: Literal[15800726617472432983] = ... + LDB_FOOTER_SIZE: Literal[48] = ... + LDB_BLOCK_TRAILER_SIZE: Literal[5] = ... + class varint(__cs__.ProtobufVarint): ... + class varint64(__cs__.ProtobufVarint): ... + class varint32(__cs__.ProtobufVarint32): ... + + class LogBlockType(__cs__.Enum): + FULL = ... + FIRST = ... + MIDDLE = ... + LAST = ... + + class LogBlockHeader(__cs__.Structure): + crc32c: _c_leveldb.uint32 + size: _c_leveldb.uint16 + type: _c_leveldb.LogBlockType + @overload + def __init__( + self, + crc32c: _c_leveldb.uint32 | None = ..., + size: _c_leveldb.uint16 | None = ..., + type: _c_leveldb.LogBlockType | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class BatchHeader(__cs__.Structure): + seq_num: _c_leveldb.uint64 + rec_count: _c_leveldb.uint32 + @overload + def __init__(self, seq_num: _c_leveldb.uint64 | None = ..., rec_count: _c_leveldb.uint32 | None = ...): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class RecordState(__cs__.Enum): + DELETED = ... + LIVE = ... + UNKNOWN = ... 
+ + class Record(__cs__.Structure): + state: _c_leveldb.RecordState + key_len: _c_leveldb.varint + key: __cs__.CharArray + @overload + def __init__( + self, + state: _c_leveldb.RecordState | None = ..., + key_len: _c_leveldb.varint | None = ..., + key: __cs__.CharArray | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class BlockHandle(__cs__.Structure): + offset: _c_leveldb.varint + length: _c_leveldb.varint + @overload + def __init__(self, offset: _c_leveldb.varint | None = ..., length: _c_leveldb.varint | None = ...): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class BlockEntry(__cs__.Structure): + shared_len: _c_leveldb.varint32 + non_shared_len: _c_leveldb.varint32 + value_len: _c_leveldb.varint32 + key: __cs__.CharArray + value: __cs__.CharArray + @overload + def __init__( + self, + shared_len: _c_leveldb.varint32 | None = ..., + non_shared_len: _c_leveldb.varint32 | None = ..., + value_len: _c_leveldb.varint32 | None = ..., + key: __cs__.CharArray | None = ..., + value: __cs__.CharArray | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class CompressionType(__cs__.Enum): + NONE = ... + SNAPPY = ... + + class BlockTrailer(__cs__.Structure): + compression: _c_leveldb.CompressionType + crc32c: __cs__.CharArray + @overload + def __init__( + self, compression: _c_leveldb.CompressionType | None = ..., crc32c: __cs__.CharArray | None = ... + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class LdbFooter(__cs__.Structure): + meta_index_handle: _c_leveldb.BlockHandle + index_handle: _c_leveldb.BlockHandle + @overload + def __init__( + self, + meta_index_handle: _c_leveldb.BlockHandle | None = ..., + index_handle: _c_leveldb.BlockHandle | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + +# Technically `c_leveldb` is an instance of `_c_leveldb`, but then we can't use it in type hints +c_leveldb: TypeAlias = _c_leveldb diff --git a/dissect/database/leveldb/exception.py b/dissect/database/leveldb/exception.py new file mode 100644 index 0000000..4bc5c3f --- /dev/null +++ b/dissect/database/leveldb/exception.py @@ -0,0 +1,7 @@ +from __future__ import annotations + +from dissect.database.exception import Error + + +class LevelDBError(Error, ValueError): + pass diff --git a/dissect/database/leveldb/leveldb.py b/dissect/database/leveldb/leveldb.py new file mode 100644 index 0000000..c34e98d --- /dev/null +++ b/dissect/database/leveldb/leveldb.py @@ -0,0 +1,365 @@ +from __future__ import annotations + +import os +from io import BytesIO +from itertools import chain +from typing import TYPE_CHECKING, BinaryIO + +from dissect.cstruct import u32, u64 + +from dissect.database.leveldb.c_leveldb import c_leveldb +from dissect.database.leveldb.exception import LevelDBError +from dissect.database.util.protobuf import decode_varint + +if TYPE_CHECKING: + from collections.abc import Iterator + from pathlib import Path + +try: + from cramjam import snappy + + HAS_CRAMJAM = True + +except ImportError: + HAS_CRAMJAM = False + + +class LevelDB: + """Google LevelDB implementation. 
+ + References: + - https://github.com/google/leveldb/blob/main/doc/log_format.md + - https://github.com/google/leveldb/blob/master/doc/table_format.md + - https://www.cclsolutionsgroup.com/post/hang-on-thats-not-sqlite-chrome-electron-and-leveldb + """ + + path: Path + manifests: list[ManifestFile] + log_files: list[LogFile] + ldb_files: list[LdbFile] + + def __init__(self, path: Path): + self.path = path + self.manifests = [] + self.log_files = [] + self.ldb_files = [] + + if not path.exists(): + raise FileNotFoundError(f"Provided path does not exist: {path!r}") + + if not path.is_dir(): + raise NotADirectoryError(f"Provided path is not a directory: {path!r}") + + for file in path.iterdir(): + if not file.is_file(): + continue + + if file.suffix.lower() == ".log": + self.log_files.append(LogFile(path=file)) + + elif file.suffix.lower() in (".ldb", ".sst"): + self.ldb_files.append(LdbFile(path=file)) + + elif file.name.startswith("MANIFEST-"): + self.manifests.append(ManifestFile(path=file)) + + self.records = list(self._records()) + + def __repr__(self) -> str: + return f"<LevelDB path={self.path}>" + + def _records(self) -> Iterator[Record]: + """Iterate over all records in this LevelDB.""" + for file in chain(self.ldb_files, self.log_files): + yield from file.records + + +class LogFile: + """Represents a single LevelDB log file.""" + + path: Path | None = None + fh: BinaryIO + blocks: list[LogBlock] + + def __init__(self, *, path: Path | None = None, fh: BinaryIO | None = None): + if path: + self.path = path + self.fh = path.open("rb") + + elif fh: + self.fh = fh + + if not path and not fh: + raise LevelDBError("LogFile requires one of path or fh") + + self.blocks = list(self._iter_batches()) + + def __repr__(self) -> str: + return f"<LogFile path={self.path}>" + + def _iter_chunks(self) -> Iterator[tuple[BytesIO, int]]: + """Yields chunks of 32KB from the log file handle.""" + + while buf := self.fh.read(c_leveldb.LOG_BLOCK_SIZE): + yield BytesIO(buf), len(buf) + + def _iter_batches(self) -> Iterator[LogBlock]: + """Yields stitched :class:`LogBlock` instances.""" + + chunk_buffer = b"" # Perhaps we should use a stream here in the future + + for chunk, size in self._iter_chunks(): + while chunk.tell() < size: + header = c_leveldb.LogBlockHeader(chunk) + + if header.type == c_leveldb.LogBlockType.FULL: + yield LogBlock(None, header, BytesIO(chunk.read(header.size))) + + elif header.type == c_leveldb.LogBlockType.FIRST: + chunk_buffer = chunk.read(header.size) + + elif header.type == c_leveldb.LogBlockType.MIDDLE: + chunk_buffer += chunk.read(header.size) + + elif header.type == c_leveldb.LogBlockType.LAST: + chunk_buffer += chunk.read(header.size) + yield LogBlock(None, header, BytesIO(chunk_buffer)) + + @property + def records(self) -> Iterator[Record]: + """Convenience method to iterate over all blocks for their respective records.""" + for block in self.blocks: + yield from block.records + + +class LogBlock: + """Represents a single LevelDB block.""" + + header: c_leveldb.LogBlockHeader + records: list[Record] + + type: c_leveldb.LogBlockType + + def __init__(self, fh: BinaryIO, header: c_leveldb.LogBlockHeader | None = None, data: BinaryIO | None = None): + if header: + self.header = header + else: + self.header = c_leveldb.LogBlockHeader(fh) + + self.type = self.header.type + + if data: + self.data = data + else: + self.data = BytesIO(fh.read(self.header.size)) + + self.records = list(self._iter_records()) + + def __repr__(self) -> str: + return f"<LogBlock type={self.type.name} size={self.header.size}>"
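+ + # A (stitched) block payload is a write batch: a 12-byte BatchHeader (base sequence + # number and record count) followed by that many consecutive record structures.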
+ def _iter_records(self) -> Iterator[Record]: + while self.data.tell() < self.header.size: + batch_header = c_leveldb.BatchHeader(self.data) + for _ in range(batch_header.rec_count): + yield Record(self.data, batch_header) + + +class Record: + """Represents a single LevelDB key/value record pair.""" + + state: c_leveldb.RecordState | None = None + sequence: int | None = None + key: bytes + value: bytes + metadata: bytes + + def __init__(self, fh: BinaryIO | None, batch_header: c_leveldb.BatchHeader | None) -> None: + self.batch_header = batch_header + self.fh = fh + + if fh: + self.header = c_leveldb.Record(fh) + self.state = self.header.state + self.key = self.header.key + + if self.state == c_leveldb.RecordState.DELETED: + self.header.value_len = 0 + self.value = b"" + elif fh: + self.header.value_len = decode_varint(fh, 5) + self.value = self.fh.read(self.header.value_len) + + if batch_header: + self.sequence = batch_header.seq_num + + def __repr__(self) -> str: + return f"<Record state={self.state} sequence={self.sequence} key={self.key!r} value={self.value!r}>" + + +class LdbFile: + """Represents a single LevelDB ``.ldb`` file.""" + + path: Path | None = None + fh: BinaryIO + records: Iterator[Record] + footer: c_leveldb.LdbFooter + + def __init__(self, *, path: Path | None = None, fh: BinaryIO | None = None): + if path: + self.path = path + self.fh = path.open("rb") + + elif fh: + self.fh = fh + + if not path and not fh: + raise LevelDBError("LdbFile requires one of path or fh") + + self.fh.seek(-c_leveldb.LDB_FOOTER_SIZE, os.SEEK_END) + self.footer = c_leveldb.LdbFooter(self.fh) + + self.fh.seek(-8, os.SEEK_END) + self.footer.magic = u64(self.fh.read()) + + if self.footer.magic != c_leveldb.LDB_MAGIC: + raise LevelDBError(f"Invalid LevelDB footer magic {self.footer.magic!r}") + + self.meta_index_block = LdbMetaIndexBlock(self.fh, self.footer.meta_index_handle) + self.index_block = LdbIndexBlock(self.fh, self.footer.index_handle) + + self._records = [] + + def __repr__(self) -> str: + return f"<LdbFile path={self.path}>" + + @property + def records(self) -> Iterator[Record]: + if self._records: + yield from self._records + return + + for _, handle in self.index_block.entries: + block = LdbBlock(self.fh, handle) + for block_entry, _ in block.entries: + record = Record(None, None) + record.metadata = block_entry.key[-8:]
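+ # The trailing 8 bytes of an .ldb key encode (sequence << 8) | state, little-endian.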
+ record.sequence = u64(record.metadata) >> 8 + + if len(block_entry.key) > 8: + record.state = ( + c_leveldb.RecordState.DELETED if block_entry.key[-8] == 0 else c_leveldb.RecordState.LIVE + ) + else: + record.state = c_leveldb.RecordState.UNKNOWN + + record.key = block_entry.key[:-8] + record.value = block_entry.value + self._records.append(record) + yield record + + +class LdbBlock: + """Represents a single LevelDB ``.ldb`` file block. + + Unlike :class:`LogBlock`, blocks in ``.ldb`` files do not have a fixed length:: + + | block_entry[n] | <- can be compressed + | -------------- | + | restart_array | <- can be compressed + | -------------- | + | trailer | + + """ + + def __init__(self, fh: BinaryIO, block_handle: c_leveldb.BlockHandle): + self.block_handle = block_handle + + fh.seek(block_handle.offset) + self.raw_data = fh.read(block_handle.length) + self.trailer = c_leveldb.BlockTrailer(fh.read(c_leveldb.LDB_BLOCK_TRAILER_SIZE)) + + self.offset = self.block_handle.offset + self.compression = self.trailer.compression + self.crc32c = self.trailer.crc32c + self.size = self.block_handle.length + self.size_decompressed = self.size + + if len(self.raw_data) != block_handle.length or len(self.trailer.dumps()) != c_leveldb.LDB_BLOCK_TRAILER_SIZE: + raise LevelDBError(f"Unable to read full LdbBlock at offset {block_handle.offset}") + + if self.trailer.compression == c_leveldb.CompressionType.SNAPPY: + if not HAS_CRAMJAM: + raise ImportError( + "Unable to decompress snappy LdbBlock: missing dependency cramjam, install with 'pip install dissect.database[leveldb]'" # noqa: E501 + ) + try: + self.data = snappy.decompress_raw(self.raw_data) + self.size_decompressed = self.data.len() + except snappy.DecompressionError as e: + raise LevelDBError("Unable to decompress LdbBlock: snappy decompression failed") from e + else: + self.data = BytesIO(self.raw_data) + + # Restart pointer is stored after all block entries. + self.data.seek(-4, os.SEEK_END) + self._restart_count = u32(self.data.read()) + self._restart_offset = self.size_decompressed - (self._restart_count + 1) * 4 + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} offset={self.offset!r} size={self.size!r} crc32c={self.crc32c.hex()!r} compression={self.compression.name!r} size_decompressed={self.size_decompressed!r}>" # noqa: E501 + + def restart_offset(self, idx: int) -> int: + offset = self._restart_offset + (idx * 4) + self.data.seek(offset) + return u32(self.data.read(4), sign=True) + + @property + def entries(self) -> Iterator[tuple[c_leveldb.BlockEntry, c_leveldb.BlockHandle]]: + offset = self.restart_offset(0) + self.data.seek(offset) + + if offset >= self._restart_offset: + raise LevelDBError("Reading start of entry past the start of restart offset") + + key = b"" + while self.data.tell() < self._restart_offset: + entry = c_leveldb.BlockEntry(self.data) + + if entry.shared_len > len(key): + raise LevelDBError("Shared key length is longer than the previous key") + + key = key[: entry.shared_len] + self.data.read(entry.non_shared_len) + entry.key = key + entry.value = self.data.read(entry.value_len) + + handle = c_leveldb.BlockHandle(entry.value) + + yield entry, handle + + +class LdbMetaIndexBlock(LdbBlock): + """Represents a single LevelDB ``.ldb`` meta index block.""" + + +class LdbIndexBlock(LdbBlock): + """Represents a single LevelDB ``.ldb`` index block.""" + + +class ManifestFile: + """Represents a single ``MANIFEST-*`` file.""" + + path: Path | None + fh: BinaryIO + + def __init__(self, *, path: Path | None = None, fh: BinaryIO | None = None): + if path: + self.path = path + self.fh = path.open("rb") + elif fh: + self.fh = fh + + if not path and not fh: + raise LevelDBError("ManifestFile requires one of path or fh") + + def __repr__(self) -> str: + return f"<ManifestFile path={self.path}>" diff --git a/dissect/database/util/__init__.py b/dissect/database/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dissect/database/util/blink.py
b/dissect/database/util/blink.py new file mode 100644 index 0000000..1623e13 --- /dev/null +++ b/dissect/database/util/blink.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum + +try: + import v8serialize + + HAS_V8 = True + +except ImportError: + HAS_V8 = False + + +class BlinkTagTypes(Enum): + """Blink tag types.""" + + BlobIndexTag = b"i" + FileIndexTag = b"e" + # NativeFileSystemFileHandleTag = b"n" + # NativeFileSystemDirectoryHandleTag = b"N" + FileListIndexTag = b"L" + # CryptoKeyTag = b"K" + + +@dataclass +class BlinkBlobIndex: + index_id: int + + +@dataclass +class BlinkFileIndex: + index_id: int + + +BlinkType = BlinkBlobIndex | BlinkFileIndex | list[BlinkFileIndex] + + +def deserialize_blink_host_object(*, stream: v8serialize.decode.ReadableTagStream) -> BlinkType: + """Support for deserializing Blink tags in V8. + + HostObject tags are the V8 serialization format's way to allow an application to insert + its own custom data into the serialized data. + + Currently does not support ``CryptoKeyTag``, ``NativeFileSystemFileHandleTag`` + and ``NativeFileSystemDirectoryHandleTag`` tags. + + References: + - https://chromium.googlesource.com/chromium/src/+/main/third_party/blink/renderer/bindings/core/v8/serialization/v8_script_value_deserializer.cc + """ + + if not HAS_V8: + raise ImportError( + "Unable to deserialize Blink object: missing dependency v8serialize, install with 'pip install dissect.database[indexeddb]'" # noqa: E501 + ) + + tag = BlinkTagTypes(stream.read_bytes(1)) + + if tag == BlinkTagTypes.BlobIndexTag: + return BlinkBlobIndex(stream.read_varint()) + + if tag == BlinkTagTypes.FileIndexTag: + return BlinkFileIndex(stream.read_varint()) + + if tag == BlinkTagTypes.FileListIndexTag: + length = stream.read_varint() + return [BlinkFileIndex(stream.read_varint()) for _ in range(length)] + + raise BlinkHostObjectHandlerDecodeError( + f"Unable to deserialize Blink object: unknown BlinkTagType {tag!r}", + position=stream.pos - 1, + data=stream.data, + ) + + +class BlinkHostObjectHandlerDecodeError(v8serialize.DecodeV8SerializeError): + """Raised when decoding a HostObject as a Blink buffer fails.""" diff --git a/dissect/database/util/protobuf.py b/dissect/database/util/protobuf.py new file mode 100644 index 0000000..ac6e9c6 --- /dev/null +++ b/dissect/database/util/protobuf.py @@ -0,0 +1,65 @@ +from __future__ import annotations + +from typing import Any, BinaryIO + +from dissect.cstruct.types.base import BaseType + + +class ProtobufVarint(BaseType): + """Implements a protobuf integer type for dissect.cstruct that can span a variable amount of bytes. + + Supports protobuf's msb varint implementation. + + References: + - https://protobuf.dev/programming-guides/encoding/ + - https://github.com/protocolbuffers/protobuf/blob/main/python/google/protobuf/internal/decoder.py + """ + + varint_limit: int = 10 + + @classmethod + def _read(cls, stream: BinaryIO, context: dict[str, Any] | None = None) -> int: + return decode_varint(stream, cls.varint_limit) + + @classmethod + def _write(cls, stream: BinaryIO, data: int) -> int: + return stream.write(encode_varint(data)) + + +class ProtobufVarint32(ProtobufVarint): + varint_limit: int = 5 + + +def decode_varint(stream: BinaryIO, limit: int) -> int: + """Reads a varint from the provided buffer stream. + + If we have not reached the end of a varint, the msb will be 1. + We read every byte from our current position until the msb is 0.
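+ + For example, ``b"\xac\x02"`` decodes to ``300``.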
+ """ + result = 0 + i = 0 + while i < limit: + byte = stream.read(1) + if len(byte) < 1: + break + + result |= (byte[0] & 0x7F) << (i * 7) + i += 1 + if byte[0] & 0x80 == 0: + break + + return result + + +def encode_varint(number: int) -> bytes: + """Encode a decoded protobuf varint to its original bytes.""" + buf = [] + while True: + towrite = number & 0x7F + number >>= 7 + if number: + buf.append(towrite | 0x80) + else: + buf.append(towrite) + break + return bytes(buf) diff --git a/pyproject.toml b/pyproject.toml index 825c79a..51d0771 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "dissect.database" -description = "A Dissect module implementing parsers for various database formats, including Berkeley DB, Microsofts Extensible Storage Engine (ESE) and SQLite3" +description = "A Dissect module implementing parsers for various database formats, including Berkeley DB, Google's LevelDB, Microsoft's Extensible Storage Engine (ESE) and SQLite3" readme = "README.md" requires-python = ">=3.10" license = "Apache-2.0" @@ -41,9 +41,19 @@ dev = [ "dissect.util>=3.5.dev,<4.0.dev", ] +leveldb = [ + "cramjam>=2.11.0,<3", # required for snappy decompression +] + +indexeddb = [ + "v8serialize>=0.4.0rc0,<1", # required for js object decoding +] + [dependency-groups] test = [ "pytest", + "cramjam", + "v8serialize", ] lint = [ "ruff==0.13.1", diff --git a/tests/_data/leveldb/indexeddb/larger/file__0.indexeddb.leveldb/000005.ldb b/tests/_data/leveldb/indexeddb/larger/file__0.indexeddb.leveldb/000005.ldb new file mode 100644 index 0000000..d9f6008 --- /dev/null +++ b/tests/_data/leveldb/indexeddb/larger/file__0.indexeddb.leveldb/000005.ldb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b0ce0b3e0beb4746347d77c98e4a8e50ac13c7f62f0c55d2558cf863b4bc901 +size 1251379 diff --git a/tests/_data/leveldb/indexeddb/larger/file__0.indexeddb.leveldb/000007.ldb b/tests/_data/leveldb/indexeddb/larger/file__0.indexeddb.leveldb/000007.ldb new file mode 100644 index 0000000..826c4df --- /dev/null +++ b/tests/_data/leveldb/indexeddb/larger/file__0.indexeddb.leveldb/000007.ldb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3169b41653bc4d8881abfb451f84858c2bcf9a7181f7801d17129e256b1380eb +size 1399944 diff --git a/tests/_data/leveldb/indexeddb/larger/file__0.indexeddb.leveldb/000008.log b/tests/_data/leveldb/indexeddb/larger/file__0.indexeddb.leveldb/000008.log new file mode 100644 index 0000000..56258f9 --- /dev/null +++ b/tests/_data/leveldb/indexeddb/larger/file__0.indexeddb.leveldb/000008.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8e588ae2e03b8fa6651bd22173d8762062922bf08621d6d870ae041ed10e38e +size 38 diff --git a/tests/_data/leveldb/indexeddb/larger/file__0.indexeddb.leveldb/CURRENT b/tests/_data/leveldb/indexeddb/larger/file__0.indexeddb.leveldb/CURRENT new file mode 100644 index 0000000..55c21f4 --- /dev/null +++ b/tests/_data/leveldb/indexeddb/larger/file__0.indexeddb.leveldb/CURRENT @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f1bad70c7bd1e0a69562853ec529355462fcd0423263a3d39d6d0d70b780443 +size 16 diff --git a/tests/_data/leveldb/indexeddb/larger/file__0.indexeddb.leveldb/MANIFEST-000001 b/tests/_data/leveldb/indexeddb/larger/file__0.indexeddb.leveldb/MANIFEST-000001 new file mode 100644 index 0000000..f9a2151 --- /dev/null +++ b/tests/_data/leveldb/indexeddb/larger/file__0.indexeddb.leveldb/MANIFEST-000001 @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:9042197e0fa23711bb8a34c1ffbb6b5d6e1aa7917004364ef5c5aec00943c14e +size 650 diff --git a/tests/_data/leveldb/indexeddb/segmented/https_mdn.github.io_0.indexeddb.leveldb/000003.log b/tests/_data/leveldb/indexeddb/segmented/https_mdn.github.io_0.indexeddb.leveldb/000003.log new file mode 100644 index 0000000..32e59f0 --- /dev/null +++ b/tests/_data/leveldb/indexeddb/segmented/https_mdn.github.io_0.indexeddb.leveldb/000003.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2b61913307392ca4cf90c9b13090f54ddea79b5fd1551bd674dbe6c58c6e8e6 +size 38632 diff --git a/tests/_data/leveldb/indexeddb/segmented/https_mdn.github.io_0.indexeddb.leveldb/CURRENT b/tests/_data/leveldb/indexeddb/segmented/https_mdn.github.io_0.indexeddb.leveldb/CURRENT new file mode 100644 index 0000000..55c21f4 --- /dev/null +++ b/tests/_data/leveldb/indexeddb/segmented/https_mdn.github.io_0.indexeddb.leveldb/CURRENT @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f1bad70c7bd1e0a69562853ec529355462fcd0423263a3d39d6d0d70b780443 +size 16 diff --git a/tests/_data/leveldb/indexeddb/segmented/https_mdn.github.io_0.indexeddb.leveldb/MANIFEST-000001 b/tests/_data/leveldb/indexeddb/segmented/https_mdn.github.io_0.indexeddb.leveldb/MANIFEST-000001 new file mode 100644 index 0000000..dbc0bce --- /dev/null +++ b/tests/_data/leveldb/indexeddb/segmented/https_mdn.github.io_0.indexeddb.leveldb/MANIFEST-000001 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:720a78803b84cbcc8eb204d5cf8ea6ee2f693be0ab2124ddf2b81455de02a3ed +size 23 diff --git a/tests/_data/leveldb/indexeddb/simple/https_mdn.github.io_0.indexeddb.leveldb/000003.log b/tests/_data/leveldb/indexeddb/simple/https_mdn.github.io_0.indexeddb.leveldb/000003.log new file mode 100644 index 0000000..673612e --- /dev/null +++ b/tests/_data/leveldb/indexeddb/simple/https_mdn.github.io_0.indexeddb.leveldb/000003.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48b2b5fdb5d8736d95ebd64092a0ba0688d3c19391a37ab97ea4790c18e1b768 +size 8798 diff --git a/tests/_data/leveldb/indexeddb/simple/https_mdn.github.io_0.indexeddb.leveldb/CURRENT b/tests/_data/leveldb/indexeddb/simple/https_mdn.github.io_0.indexeddb.leveldb/CURRENT new file mode 100644 index 0000000..55c21f4 --- /dev/null +++ b/tests/_data/leveldb/indexeddb/simple/https_mdn.github.io_0.indexeddb.leveldb/CURRENT @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f1bad70c7bd1e0a69562853ec529355462fcd0423263a3d39d6d0d70b780443 +size 16 diff --git a/tests/_data/leveldb/indexeddb/simple/https_mdn.github.io_0.indexeddb.leveldb/MANIFEST-000001 b/tests/_data/leveldb/indexeddb/simple/https_mdn.github.io_0.indexeddb.leveldb/MANIFEST-000001 new file mode 100644 index 0000000..dbc0bce --- /dev/null +++ b/tests/_data/leveldb/indexeddb/simple/https_mdn.github.io_0.indexeddb.leveldb/MANIFEST-000001 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:720a78803b84cbcc8eb204d5cf8ea6ee2f693be0ab2124ddf2b81455de02a3ed +size 23 diff --git a/tests/_data/leveldb/indexeddb/types/file__0.indexeddb.leveldb/000003.log b/tests/_data/leveldb/indexeddb/types/file__0.indexeddb.leveldb/000003.log new file mode 100644 index 0000000..cd02ae7 --- /dev/null +++ b/tests/_data/leveldb/indexeddb/types/file__0.indexeddb.leveldb/000003.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5eb9a2bbd55254bc4344bc6572fb680bc57e7dbbb97ef4f726cff549136fe446 +size 16262 diff 
diff --git a/tests/_data/leveldb/indexeddb/types/file__0.indexeddb.leveldb/CURRENT b/tests/_data/leveldb/indexeddb/types/file__0.indexeddb.leveldb/CURRENT
new file mode 100644
index 0000000..55c21f4
--- /dev/null
+++ b/tests/_data/leveldb/indexeddb/types/file__0.indexeddb.leveldb/CURRENT
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f1bad70c7bd1e0a69562853ec529355462fcd0423263a3d39d6d0d70b780443
+size 16
diff --git a/tests/_data/leveldb/indexeddb/types/file__0.indexeddb.leveldb/MANIFEST-000001 b/tests/_data/leveldb/indexeddb/types/file__0.indexeddb.leveldb/MANIFEST-000001
new file mode 100644
index 0000000..dbc0bce
--- /dev/null
+++ b/tests/_data/leveldb/indexeddb/types/file__0.indexeddb.leveldb/MANIFEST-000001
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:720a78803b84cbcc8eb204d5cf8ea6ee2f693be0ab2124ddf2b81455de02a3ed
+size 23
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..987d913
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+import importlib.util
+
+import pytest
+
+HAS_BENCHMARK = importlib.util.find_spec("pytest_benchmark") is not None
+
+
+def pytest_configure(config: pytest.Config) -> None:
+    if not HAS_BENCHMARK:
+        # If we don't have pytest-benchmark (or pytest-codspeed) installed, register the benchmark marker ourselves
+        # to avoid pytest warnings
+        config.addinivalue_line("markers", "benchmark: mark test for benchmarking (requires pytest-benchmark)")
+
+
+def pytest_runtest_setup(item: pytest.Item) -> None:
+    if not HAS_BENCHMARK and item.get_closest_marker("benchmark") is not None:
+        pytest.skip("pytest-benchmark is not installed")
diff --git a/tests/indexeddb/__init__.py b/tests/indexeddb/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/indexeddb/test_benchmark.py b/tests/indexeddb/test_benchmark.py
new file mode 100644
index 0000000..52e3049
--- /dev/null
+++ b/tests/indexeddb/test_benchmark.py
@@ -0,0 +1,23 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+
+from dissect.database import IndexedDB
+from tests._util import absolute_path
+
+if TYPE_CHECKING:
+    from pytest_benchmark.fixture import BenchmarkFixture
+
+
+@pytest.mark.benchmark
+def test_benchmark_indexeddb(benchmark: BenchmarkFixture) -> None:
+    """Test if we can parse a medium-sized IndexedDB."""
+
+    path = absolute_path("_data/leveldb/indexeddb/larger/file__0.indexeddb.leveldb")
+    records = benchmark(lambda: IndexedDB(path).database("ExampleDatabase").object_store("MyObjectStore").records)
+
+    assert len(records) == 10_002
+    assert records[-1].key == 1
+    assert records[-1].value == {"id": 1, "name": {"first": "John", "last": "Doe"}, "age": 42}
diff --git a/tests/indexeddb/test_simple.py b/tests/indexeddb/test_simple.py
new file mode 100644
index 0000000..31a2e3f
--- /dev/null
+++ b/tests/indexeddb/test_simple.py
@@ -0,0 +1,44 @@
+from __future__ import annotations
+
+from dissect.database.indexeddb.indexeddb import IndexedDB
+from tests._util import absolute_path
+
+
+def test_indexeddb_basic_example() -> None:
+    """Test if we can parse a basic IndexedDB example.
+
+    References:
+        - https://mdn.github.io/dom-examples/indexeddb-api/index.html
+    """
+
+    path = absolute_path("_data/leveldb/indexeddb/simple/https_mdn.github.io_0.indexeddb.leveldb")
+    indexeddb = IndexedDB(path)
+
+    assert len(indexeddb.databases) == 1
+
+    database = indexeddb.database(1)
+
+    assert database.id == 1
+    assert database.origin == "https_mdn.github.io_0@1"
+    assert database.name == "mdn-demo-indexeddb-epublications"
+    assert database._maximum_object_store_id == 1
+    assert len(database.object_stores) == 1
+
+    object_store = database.object_store(1)
+    assert object_store.name == "publications"
+
+    assert len(object_store.records) == 5
+
+    # This record was deleted (a marker exists but is currently not parsed)
+    record = object_store.get(4)
+    assert record.value["biblioid"] == "978-0141036144"
+    assert record.value["title"] == "1984"
+    assert record.value["year"] == 1949
+    assert record.value["blob"].index_id == 0
+
+    # This is the next regular record
+    record = object_store.get(5)
+    assert record.value["biblioid"] == "978-0007532278"
+    assert record.value["title"] == "I, Robot"
+    assert record.value["year"] == 1950
+    assert record.value["blob"].index_id == 0
diff --git a/tests/indexeddb/test_types.py b/tests/indexeddb/test_types.py
new file mode 100644
index 0000000..314ccbe
--- /dev/null
+++ b/tests/indexeddb/test_types.py
@@ -0,0 +1,158 @@
+from __future__ import annotations
+
+from datetime import datetime, timezone
+
+from v8serialize.jstypes import JSRegExp, JSUndefined
+
+from dissect.database.indexeddb.indexeddb import IndexedDB
+from dissect.database.util.blink import BlinkBlobIndex
+from tests._util import absolute_path
+
+
+def test_indexeddb_different_types() -> None:
+    """Test if we can parse different IndexedDB value types.
+
+    References:
+        - https://github.com/cclgroupltd/ccl_chromium_reader/blob/master/tools_and_utilities/extras/make_test_indexeddb.html
+    """
+
+    path = absolute_path("_data/leveldb/indexeddb/types/file__0.indexeddb.leveldb")
+
+    indexeddb = IndexedDB(path)
+    database = indexeddb.database("MyTestDatabase")
+    object_store = database.object_store("store")
+
+    assert len(object_store.records) == 19
+
+    # Basics
+    record = object_store.get("basics")
+    assert record.value == {
+        "id": "basics",
+        "true": True,
+        "false": False,
+        "null": None,
+        "undefined": JSUndefined,
+        "string_1a": "this string literal is repeated",
+        "string_1b": "this string literal is repeated",
+        "string_2a": "this string object is repeated",
+        "string_2b": "this string object is repeated",
+        "the_number_100": 100,
+        "the_number_1000000000": 1000000000,
+        "the_number_1.5": 1.5,
+        "aRegex": JSRegExp("[A-z]{3}", 0),
+        "date": datetime(2022, 11, 21, 16, 0, tzinfo=timezone.utc),
+    }
+
+    # Big integers
+    record = object_store.get("the_one_with_bigints")
+    assert record.value == {
+        "id": "the_one_with_bigints",
+        "a_BigInt": 1000,
+        "a_neg_bigInt": -1000,
+        "a_hugeInt": 100000000000000000000000,
+        "beeegInt": 605951652385920480004291274127545002769392376323959195201050263810007255232559947887954882812532343572574055983,  # noqa: E501
+    }
+
+    # Collections
+    record = object_store.get("the_one_with_collections")
+    assert record.value["id"] == "the_one_with_collections"
+    assert record.value["dense_array"] == {0: "one", 1: "two", 2: "three", 3: "four"}
+    assert record.value["sparse_array"][32] == "ELEMENT AT INDEX 32"
+    assert record.value["sparse_array"][92] == "ELEMENT AT INDEX 92"
+    assert record.value["inner_object"] == {"key1": "value1", "key2": "value2", "key3": "value3"}
+    assert record.value["map"] == {"map_key1": "map_value1", "map_key2": "map_value2", "map_key3": "map_value3"}
+    assert record.value["set"] == {"set_value2", "set_value1", "set_value3", "set_value4"}
+
+    # Array buffers
+    record = object_store.get("the_one_with_array_buffers")
+    assert record.value["id"] == "the_one_with_array_buffers"
+    assert bytes(record.value["array_buffer"]) == bytes.fromhex(
+        "0100000002000000030000000400000005000000060000000000000000000000"
+        "0000000000000000000000000000000000000000000000000000000000000000"
+        "0000000000000000000000000000000000000000000000000000000000000000"
+        "0000000000000000000000000000000000000000000000000000000000000000"
+    )
+    assert tuple(record.value["int32_buffer"].get_buffer()) == (1, 2, 3, 4, 5, 6, *tuple(0 for _ in range(26)))
+
+    # Cyclic references
+    record = object_store.get("the_one_with_cyclic_references")
+    assert record.value["id"] == "the_one_with_cyclic_references"
+    assert record.value["one_layer"]["parent"] is None
+    assert record.value["one_layer"]["children"][0]["parent"] == record.value["one_layer"]  # recursion, baby!
+    assert record.value["one_layer"]["children"][0]["children"] == {}
+    assert (
+        record.value["three_item_cyclic_linked_list"]["next1"]["prev2"]
+        == record.value["three_item_cyclic_linked_list"]["prev1"]["prev3"]["prev2"]
+    )
+
+    # Strings
+    record = object_store.get("the_one_with_unicode")
+    assert record.value["id"] == "the_one_with_unicode"
+    assert record.value["all_ascii"] == "hello world"
+    assert record.value["ascii_plus_latin1"] == "hélló wórld"
+    assert record.value["ascii_plus_emoji"] == "hell😮 world"
+    assert record.value["all_unicode"] == "😛😫😋😎"
+
+    # Primitives and objects
+    record = object_store.get("the_one_with_primitives_and_objects")
+    assert record.value["id"] == "the_one_with_primitives_and_objects"
+    assert record.value["string_primitive"] == "hello primitive"
+    assert record.value["string_object"] == "hello object"
+    assert record.value["bool_primitive_true"] is True
+    assert record.value["bool_object_true"] is True
+    assert record.value["number_primitive_1000"] == 1000
+    assert record.value["number_object_1000"] == 1000
+    assert record.value["bigint_primitive_abcdefabcdefabcdefabcdef"] == 53170898287292916380478459375
+    assert record.value["bigint_object_abcdefabcdefabcdefabcdef"] == 53170898287292916380478459375
+
+    # Different types in primary keys
+    record = object_store.get(["an", "array", "of", "text"])
+    assert record.value["text"] == "primary key is an array of text"
+
+    record = object_store.get(1000)
+    assert record.value["id"] == 1000
+    assert record.value["text"] == "primary key is the integer 1000"
+
+    record = object_store.get(["an", "array", "of", "text", "and", "an", "integer", 1000])
+    assert record.value["text"] == "primary is an array of text with a number at the end"
+
+    # TODO: Big record for kIDBWrapThreshold
+    record = object_store.get("a_big_record_to_test_kIDBWrapThreshold_in_chrome")
+    assert record.value == b"\x01\x9e\xc0\x0f\x00"
+    record = object_store.get(
+        "a_big_record_to_test_kIDBWrapThreshold_in_chrome_plus_a_file_to_check_how_mozilla_does_that"
+    )
+    assert record.value == b"\x01\xd2\xc0\x0f\x00"
+
+    # TODO: Cryptography objects
+    record = object_store.get("the_one_with_crypto_objects")
+    assert record.value.startswith(b'\xff\x0fo"\x02id"\x1bthe_one_with_crypto_objects"\x03rsa')
+    assert len(record.value) == 3572
+
+    # TODO: File blobs
+    record = object_store.get("the_one_with_a_blob")
+    assert record.value["id"] == "the_one_with_a_blob"
+    assert record.value["blob"] == BlinkBlobIndex(index_id=0)
+
+    # Key is a nested array with a string
+    record = object_store.get([["foo"]])
+    assert record.value["id"] == {0: {0: "foo"}}
+    assert record.value["desc"] == 'key is [["foo"]]'
+
+    # Key is a nested array without values
+    record = object_store.get([[]])
+    assert record.value["id"] == {0: {}}
+    assert record.value["desc"] == "key is [[]]"
+
+    # Key is a datetime object
+    record = object_store.get(datetime(2024, 4, 30, 22, 0, tzinfo=timezone.utc))
+    assert record.value["id"] == datetime(2024, 4, 30, 22, 0, tzinfo=timezone.utc)
+    assert record.value["desc"] == "key is new Date(2024, 4, 1)"
+
+    # Key is a nested array with integers
+    record = object_store.get([[[1, 2], 3, [[4], 5, 6], [7, [8, 9]]], 10])
+    assert record.value["id"] == {
+        0: {0: {0: 1, 1: 2}, 1: 3, 2: {0: {0: 4}, 1: 5, 2: 6}, 3: {0: 7, 1: {0: 8, 1: 9}}},
+        1: 10,
+    }
+    assert record.value["desc"] == "key is [[[1,2], 3, [[4], 5, 6], [7, [8, 9]]], 10]"
diff --git a/tests/leveldb/__init__.py b/tests/leveldb/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/leveldb/test_indexeddb.py b/tests/leveldb/test_indexeddb.py
new file mode 100644
index 0000000..64c0d1c
--- /dev/null
+++ b/tests/leveldb/test_indexeddb.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+from dissect.database.leveldb.c_leveldb import c_leveldb
+from dissect.database.leveldb.leveldb import LdbFile, LevelDB, LogFile
+from tests._util import absolute_path
+
+
+def test_leveldb_log_file_full() -> None:
+    """Test if we can parse a Google Chrome LevelDB log file containing only full (unsegmented) blocks,
+    created by an IndexedDB serializer."""
+
+    file = absolute_path("_data/leveldb/indexeddb/simple/https_mdn.github.io_0.indexeddb.leveldb/000003.log")
+    log_file = LogFile(fh=file.open("rb"))
+
+    blocks = log_file.blocks
+    records = list(log_file.records)
+
+    assert len(blocks) == 34
+    assert len(records) == 284
+
+    assert records[0].state == 1
+    assert records[0].key == bytes.fromhex("000000003200")
+    assert records[0].value == bytes.fromhex("0801")
+
+    assert records[-1].state == 0
+    assert records[-1].key == bytes.fromhex("00000000320127")
+    assert records[-1].value == b""
+
+
+def test_leveldb_log_file_start_middle_end() -> None:
+    """Test if we can parse a larger LevelDB log file with segmented blocks (start-middle-end)."""
+
+    file = absolute_path("_data/leveldb/indexeddb/segmented/https_mdn.github.io_0.indexeddb.leveldb/000003.log")
+    log_file = LogFile(fh=file.open("rb"))
+
+    records = list(log_file.records)
+    num_records = len(records)
+
+    assert num_records == 1057
+
+
+def test_leveldb_ldb_file() -> None:
+    """Test if we can parse a LevelDB ldb file."""
+
+    file = absolute_path("_data/leveldb/indexeddb/larger/file__0.indexeddb.leveldb/000005.ldb")
+    ldb = LdbFile(fh=file.open("rb"))
+    records = list(ldb.records)
+
+    assert len(records) == 42983
+
+    assert records[-1].state == c_leveldb.RecordState.LIVE
+    assert records[-1].sequence == 29241
+    assert records[-1].key == bytes.fromhex(
+        "0001011f0402011e00560061006e002d0077006f007200740068006c00650079"
+        "00560061006e002d0077006f007200740068006c00650079004d00610063004c"
+        "0065002d011500430068007200690073002d0061006e00740068006f006e0079"
+        "00530061006d002d00420065006e002d00037c96d958cf957942"
+    )
+    assert records[-1].metadata == bytes.fromhex("0139720000000000")
+    assert records[-1].value == bytes.fromhex("c11c037c96d958cf957942")
+
+
+def test_leveldb_dir_parsing() -> None:
+    """Test if we find all LevelDB log files and ldb files in a directory."""
+
+    leveldb = LevelDB(absolute_path("_data/leveldb/indexeddb/larger/file__0.indexeddb.leveldb"))
+
+    assert len(leveldb.manifests) == 1
+    assert len(leveldb.ldb_files) == 2
+    assert len(leveldb.log_files) == 1
+
+    records = list(leveldb.records)
+    num_records = len(records)
+    assert num_records == 120085
diff --git a/tox.ini b/tox.ini
index 284a4ba..5e3a6d5 100755
--- a/tox.ini
+++ b/tox.ini
@@ -22,6 +22,16 @@ commands =
     coverage report
     coverage xml
 
+[testenv:benchmark]
+deps =
+    pytest-benchmark
+    pytest-codspeed
+dependency_groups = test
+passenv =
+    CODSPEED_ENV
+commands =
+    pytest --basetemp="{envtmpdir}" -m benchmark {posargs:--color=yes -v tests}
+
 [testenv:build]
 package = skip
 dependency_groups = build
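
Usage sketch (appended for illustration; not part of the patch above). The tests in this diff exercise the new public API, so reading a LevelDB directory or an IndexedDB database could look roughly like the following. The LevelDB and IndexedDB classes, their records, databases, database() and object_store() accessors, and the record attributes are all taken from the test code in this diff; the input path is hypothetical, and the encode_varint import location is an assumption inferred from the dissect.database.util.protobuf imports earlier in the patch.

    from pathlib import Path

    from dissect.database import IndexedDB, LevelDB
    from dissect.database.util.protobuf import encode_varint  # assumed module location

    # Low-level view: iterate raw LevelDB records (keys and values are bytes)
    leveldb = LevelDB(Path("path/to/file__0.indexeddb.leveldb"))  # hypothetical path
    for record in leveldb.records:
        print(record.state, record.sequence, record.key, record.value)

    # High-level view: decode the IndexedDB structures layered on top of LevelDB
    indexeddb = IndexedDB(Path("path/to/file__0.indexeddb.leveldb"))
    for database in indexeddb.databases:
        for object_store in database.object_stores:
            for rec in object_store.records:
                print(database.name, object_store.name, rec.key, rec.value)

    # The varint helper defined above round-trips the classic protobuf example:
    # 300 -> low 7 bits 0x2C with continuation bit set (0xAC), then 0x02
    assert encode_varint(300) == b"\xac\x02"

Note that pyproject.toml declares leveldb and indexeddb as optional extras, so a full install would be something along the lines of: pip install "dissect.database[leveldb,indexeddb]".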