diff --git a/docs/mkdocs.yaml b/docs/mkdocs.yaml index 4de4f58e1..1b76db26c 100644 --- a/docs/mkdocs.yaml +++ b/docs/mkdocs.yaml @@ -4,78 +4,71 @@ site_name: DataJoint Documentation repo_url: https://github.com/datajoint/datajoint-python repo_name: datajoint/datajoint-python nav: - - DataJoint Python: index.md - - Quick Start Guide: quick-start.md + - Home: index.md + - Quick Start: quick-start.md - Concepts: - - Principles: concepts/principles.md - - Data Model: concepts/data-model.md - - Data Pipelines: concepts/data-pipelines.md - - Teamwork: concepts/teamwork.md + - concepts/index.md - Terminology: concepts/terminology.md - - System Administration: - - Database Administration: sysadmin/database-admin.md - - Bulk Storage Systems: sysadmin/bulk-storage.md - - External Store: sysadmin/external-store.md - - Client Configuration: - - Install: client/install.md - - Credentials: client/credentials.md - - Settings: client/settings.md - - File Stores: client/stores.md + - Getting Started: + - Installation: client/install.md + - Connection: client/credentials.md + - Configuration: client/settings.md - Schema Design: - - Schema Creation: design/schema.md - - Table Definition: - - Table Tiers: design/tables/tiers.md - - Declaration Syntax: design/tables/declare.md - - Primary Key: design/tables/primary.md - - Attributes: design/tables/attributes.md - - Lookup Tables: design/tables/lookup.md - - Manual Tables: design/tables/manual.md - - Blobs: design/tables/blobs.md - - Attachments: design/tables/attach.md - - Filepaths: design/tables/filepath.md - - Custom Datatypes: design/tables/customtype.md - - Dependencies: design/tables/dependencies.md - - Indexes: design/tables/indexes.md - - Master-Part Relationships: design/tables/master-part.md - - Schema Diagrams: design/diagrams.md - - Entity Normalization: design/normalization.md - - Data Integrity: design/integrity.md - - Schema Recall: design/recall.md - - Schema Drop: design/drop.md - - Schema Modification: design/alter.md - - Data Manipulations: - - manipulation/index.md - - Insert: manipulation/insert.md - - Delete: manipulation/delete.md - - Update: manipulation/update.md - - Transactions: manipulation/transactions.md - - Data Queries: - - Principles: query/principles.md - - Example Schema: query/example-schema.md + - Schemas: design/schema.md + - Table Tiers: design/tables/tiers.md + - Declaration: design/tables/declare.md + - Primary Key: design/tables/primary.md + - Attributes: design/tables/attributes.md + - Foreign Keys: design/tables/dependencies.md + - Indexes: design/tables/indexes.md + - Lookup Tables: design/tables/lookup.md + - Manual Tables: design/tables/manual.md + - Master-Part: design/tables/master-part.md + - Diagrams: design/diagrams.md + - Alter: design/alter.md + - Drop: design/drop.md + - Data Types: + - Blob: datatypes/blob.md + - Attach: datatypes/attach.md + - Filepath: datatypes/filepath.md + - Object: datatypes/object.md + - Adapted Types: datatypes/adapters.md + - Data Operations: + - operations/index.md + - Insert: operations/insert.md + - Delete: operations/delete.md + - Update: operations/update.md + - Transactions: operations/transactions.md + - Make Method: operations/make.md + - Populate: operations/populate.md + - Key Source: operations/key-source.md + - Jobs: operations/jobs.md + - Distributed: operations/distributed.md + - Queries: + - query/principles.md - Fetch: query/fetch.md - - Iteration: query/iteration.md - Operators: query/operators.md - Restrict: query/restrict.md - - Projection: query/project.md + - Project: query/project.md - Join: query/join.md - Aggregation: query/aggregation.md - Union: query/union.md - Universal Sets: query/universals.md - - Query Caching: query/query-caching.md - - Computations: - - Make Method: compute/make.md - - Populate: compute/populate.md - - Key Source: compute/key-source.md - - Distributed Computing: compute/distributed.md - - Publish Data: publish-data.md - - Internals: - - SQL Transpilation: internal/transpilation.md + - Iteration: query/iteration.md + - Caching: query/query-caching.md + - Administration: + - Database: admin/database.md + - Storage Backends: admin/storage.md + - External Store: admin/external-store.md - Tutorials: - - JSON Datatype: tutorials/json.ipynb - - FAQ: faq.md - - Developer Guide: develop.md - - Citation: citation.md - - Changelog: changelog.md + - JSON Datatype: tutorials/json.ipynb + - Reference: + - FAQ: reference/faq.md + - SQL Transpilation: reference/transpilation.md + - Publishing Data: reference/publish-data.md + - Developer Guide: reference/develop.md + - Citation: reference/citation.md + - Changelog: changelog.md - API: api/ # defer to gen-files + literate-nav # ---------------------------- STANDARD ----------------------------- diff --git a/docs/src/sysadmin/database-admin.md b/docs/src/admin/database.md similarity index 100% rename from docs/src/sysadmin/database-admin.md rename to docs/src/admin/database.md diff --git a/docs/src/sysadmin/external-store.md b/docs/src/admin/external-store.md similarity index 98% rename from docs/src/sysadmin/external-store.md rename to docs/src/admin/external-store.md index aac61fe24..c956101a2 100644 --- a/docs/src/sysadmin/external-store.md +++ b/docs/src/admin/external-store.md @@ -34,7 +34,7 @@ For example, the following table stores motion-aligned two-photon movies. aligned_movie : blob@external # motion-aligned movie in 'external' store ``` -All [insert](../manipulation/insert.md) and [fetch](../query/fetch.md) operations work +All [insert](../operations/insert.md) and [fetch](../query/fetch.md) operations work identically for `external` attributes as they do for `blob` attributes, with the same serialization protocol. Similar to `blobs`, `external` attributes cannot be used in restriction conditions. @@ -116,12 +116,12 @@ configured external store. [foreign keys](../design/tables/dependencies.md) referencing the `~external_` table (but are not shown as such to the user). -8. The [insert](../manipulation/insert.md) operation encodes and hashes the blob data. +8. The [insert](../operations/insert.md) operation encodes and hashes the blob data. If an external object is not present in storage for the same hash, the object is saved and if the save operation is successful, corresponding entities in table `~external_` for that store are created. -9. The [delete](../manipulation/delete.md) operation first deletes the foreign key +9. The [delete](../operations/delete.md) operation first deletes the foreign key reference in the target table. The external table entry and actual external object is not actually deleted at this time (`soft-delete`). diff --git a/docs/src/sysadmin/bulk-storage.md b/docs/src/admin/storage.md similarity index 100% rename from docs/src/sysadmin/bulk-storage.md rename to docs/src/admin/storage.md diff --git a/docs/src/client/settings.md b/docs/src/client/settings.md index a05369bb9..cad1176dd 100644 --- a/docs/src/client/settings.md +++ b/docs/src/client/settings.md @@ -152,7 +152,7 @@ dj.config.database.use_tls = None # Auto (default) ## External Storage -Configure external stores in the `stores` section. See [External Storage](../sysadmin/external-store.md) for details. +Configure external stores in the `stores` section. See [External Storage](../admin/external-store.md) for details. ```json { @@ -164,3 +164,57 @@ Configure external stores in the `stores` section. See [External Storage](../sys } } ``` + +## Object Storage + +Configure object storage for the [`object` type](../design/tables/object.md) in the `object_storage` section. This provides managed file and folder storage with fsspec backend support. + +### Local Filesystem + +```json +{ + "object_storage": { + "project_name": "my_project", + "protocol": "file", + "location": "/data/my_project" + } +} +``` + +### Amazon S3 + +```json +{ + "object_storage": { + "project_name": "my_project", + "protocol": "s3", + "bucket": "my-bucket", + "location": "my_project", + "endpoint": "s3.amazonaws.com" + } +} +``` + +### Object Storage Settings + +| Setting | Environment Variable | Required | Description | +|---------|---------------------|----------|-------------| +| `object_storage.project_name` | `DJ_OBJECT_STORAGE_PROJECT_NAME` | Yes | Unique project identifier | +| `object_storage.protocol` | `DJ_OBJECT_STORAGE_PROTOCOL` | Yes | Backend: `file`, `s3`, `gcs`, `azure` | +| `object_storage.location` | `DJ_OBJECT_STORAGE_LOCATION` | Yes | Base path or bucket prefix | +| `object_storage.bucket` | `DJ_OBJECT_STORAGE_BUCKET` | For cloud | Bucket name | +| `object_storage.endpoint` | `DJ_OBJECT_STORAGE_ENDPOINT` | For S3 | S3 endpoint URL | +| `object_storage.partition_pattern` | `DJ_OBJECT_STORAGE_PARTITION_PATTERN` | No | Path pattern with `{attr}` placeholders | +| `object_storage.token_length` | `DJ_OBJECT_STORAGE_TOKEN_LENGTH` | No | Random suffix length (default: 8) | +| `object_storage.access_key` | — | For cloud | Access key (use secrets) | +| `object_storage.secret_key` | — | For cloud | Secret key (use secrets) | + +### Object Storage Secrets + +Store cloud credentials in the secrets directory: + +``` +.secrets/ +├── object_storage.access_key +└── object_storage.secret_key +``` diff --git a/docs/src/compute/populate.md b/docs/src/compute/populate.md deleted file mode 100644 index 45c863f17..000000000 --- a/docs/src/compute/populate.md +++ /dev/null @@ -1,317 +0,0 @@ -# Auto-populate - -Auto-populated tables are used to define, execute, and coordinate computations in a -DataJoint pipeline. - -Tables in the initial portions of the pipeline are populated from outside the pipeline. -In subsequent steps, computations are performed automatically by the DataJoint pipeline -in auto-populated tables. - -Computed tables belong to one of the two auto-populated -[data tiers](../design/tables/tiers.md): `dj.Imported` and `dj.Computed`. -DataJoint does not enforce the distinction between imported and computed tables: the -difference is purely semantic, a convention for developers to follow. -If populating a table requires access to external files such as raw storage that is not -part of the database, the table is designated as **imported**. -Otherwise it is **computed**. - -Auto-populated tables are defined and queried exactly as other tables. -(See [Manual Tables](../design/tables/manual.md).) -Their data definition follows the same [definition syntax](../design/tables/declare.md). - -## Make - -For auto-populated tables, data should never be entered using -[insert](../manipulation/insert.md) directly. -Instead these tables must define the callback method `make(self, key)`. -The `insert` method then can only be called on `self` inside this callback method. - -Imagine that there is a table `test.Image` that contains 2D grayscale images in its -`image` attribute. -Let us define the computed table, `test.FilteredImage` that filters the image in some -way and saves the result in its `filtered_image` attribute. - -The class will be defined as follows. - -```python -@schema -class FilteredImage(dj.Computed): - definition = """ - # Filtered image - -> Image - --- - filtered_image : longblob - """ - - def make(self, key): - img = (test.Image & key).fetch1('image') - key['filtered_image'] = myfilter(img) - self.insert1(key) -``` - -The `make` method receives one argument: the dict `key` containing the primary key -value of an element of [key source](key-source.md) to be worked on. - -The key represents the partially filled entity, usually already containing the -[primary key](../design/tables/primary.md) attributes of the key source. - -The `make` callback does three things: - -1. [Fetches](../query/fetch.md) data from tables upstream in the pipeline using the -`key` for [restriction](../query/restrict.md). -2. Computes and adds any missing attributes to the fields already in `key`. -3. Inserts the entire entity into `self`. - -A single `make` call may populate multiple entities when `key` does not specify the -entire primary key of the populated table, when the definition adds new attributes to the primary key. -This design is uncommon and not recommended. -The standard practice for autopopulated tables is to have its primary key composed of -foreign keys pointing to parent tables. - -### Three-Part Make Pattern for Long Computations - -For long-running computations, DataJoint provides an advanced pattern called the -**three-part make** that separates the `make` method into three distinct phases. -This pattern is essential for maintaining database performance and data integrity -during expensive computations. - -#### The Problem: Long Transactions - -Traditional `make` methods perform all operations within a single database transaction: - -```python -def make(self, key): - # All within one transaction - data = (ParentTable & key).fetch1() # Fetch - result = expensive_computation(data) # Compute (could take hours) - self.insert1(dict(key, result=result)) # Insert -``` - -This approach has significant limitations: -- **Database locks**: Long transactions hold locks on tables, blocking other operations -- **Connection timeouts**: Database connections may timeout during long computations -- **Memory pressure**: All fetched data must remain in memory throughout the computation -- **Failure recovery**: If computation fails, the entire transaction is rolled back - -#### The Solution: Three-Part Make Pattern - -The three-part make pattern splits the `make` method into three distinct phases, -allowing the expensive computation to occur outside of database transactions: - -```python -def make_fetch(self, key): - """Phase 1: Fetch all required data from parent tables""" - fetched_data = ((ParentTable & key).fetch1(),) - return fetched_data # must be a sequence, eg tuple or list - -def make_compute(self, key, *fetched_data): - """Phase 2: Perform expensive computation (outside transaction)""" - computed_result = expensive_computation(*fetched_data) - return computed_result # must be a sequence, eg tuple or list - -def make_insert(self, key, *computed_result): - """Phase 3: Insert results into the current table""" - self.insert1(dict(key, result=computed_result)) -``` - -#### Execution Flow - -To achieve data intensity without long transactions, the three-part make pattern follows this sophisticated execution sequence: - -```python -# Step 1: Fetch data outside transaction -fetched_data1 = self.make_fetch(key) -computed_result = self.make_compute(key, *fetched_data1) - -# Step 2: Begin transaction and verify data consistency -begin transaction: - fetched_data2 = self.make_fetch(key) - if fetched_data1 != fetched_data2: # deep comparison - cancel transaction # Data changed during computation - else: - self.make_insert(key, *computed_result) - commit_transaction -``` - -#### Key Benefits - -1. **Reduced Database Lock Time**: Only the fetch and insert operations occur within transactions, minimizing lock duration -2. **Connection Efficiency**: Database connections are only used briefly for data transfer -3. **Memory Management**: Fetched data can be processed and released during computation -4. **Fault Tolerance**: Computation failures don't affect database state -5. **Scalability**: Multiple computations can run concurrently without database contention - -#### Referential Integrity Protection - -The pattern includes a critical safety mechanism: **referential integrity verification**. -Before inserting results, the system: - -1. Re-fetches the source data within the transaction -2. Compares it with the originally fetched data using deep hashing -3. Only proceeds with insertion if the data hasn't changed - -This prevents the "phantom read" problem where source data changes during long computations, -ensuring that results remain consistent with their inputs. - -#### Implementation Details - -The pattern is implemented using Python generators in the `AutoPopulate` class: - -```python -def make(self, key): - # Step 1: Fetch data from parent tables - fetched_data = self.make_fetch(key) - computed_result = yield fetched_data - - # Step 2: Compute if not provided - if computed_result is None: - computed_result = self.make_compute(key, *fetched_data) - yield computed_result - - # Step 3: Insert the computed result - self.make_insert(key, *computed_result) - yield -``` -Therefore, it is possible to override the `make` method to implement the three-part make pattern by using the `yield` statement to return the fetched data and computed result as above. - -#### Use Cases - -This pattern is particularly valuable for: - -- **Machine learning model training**: Hours-long training sessions -- **Image processing pipelines**: Large-scale image analysis -- **Statistical computations**: Complex statistical analyses -- **Data transformations**: ETL processes with heavy computation -- **Simulation runs**: Time-consuming simulations - -#### Example: Long-Running Image Analysis - -Here's an example of how to implement the three-part make pattern for a -long-running image analysis task: - -```python -@schema -class ImageAnalysis(dj.Computed): - definition = """ - # Complex image analysis results - -> Image - --- - analysis_result : longblob - processing_time : float - """ - - def make_fetch(self, key): - """Fetch the image data needed for analysis""" - return (Image & key).fetch1('image'), - - def make_compute(self, key, image_data): - """Perform expensive image analysis outside transaction""" - import time - start_time = time.time() - - # Expensive computation that could take hours - result = complex_image_analysis(image_data) - processing_time = time.time() - start_time - return result, processing_time - - def make_insert(self, key, analysis_result, processing_time): - """Insert the analysis results""" - self.insert1(dict(key, - analysis_result=analysis_result, - processing_time=processing_time)) -``` - -The exact same effect may be achieved by overriding the `make` method as a generator function using the `yield` statement to return the fetched data and computed result as above: - -```python -@schema -class ImageAnalysis(dj.Computed): - definition = """ - # Complex image analysis results - -> Image - --- - analysis_result : longblob - processing_time : float - """ - - def make(self, key): - image_data = (Image & key).fetch1('image') - computed_result = yield (image_data, ) # pack fetched_data - - if computed_result is None: - # Expensive computation that could take hours - import time - start_time = time.time() - result = complex_image_analysis(image_data) - processing_time = time.time() - start_time - computed_result = result, processing_time #pack - yield computed_result - - result, processing_time = computed_result # unpack - self.insert1(dict(key, - analysis_result=result, - processing_time=processing_time)) - yield # yield control back to the caller -``` -We expect that most users will prefer to use the three-part implementation over the generator function implementation due to its conceptual complexity. - -## Populate - -The inherited `populate` method of `dj.Imported` and `dj.Computed` automatically calls -`make` for every key for which the auto-populated table is missing data. - -The `FilteredImage` table can be populated as - -```python -FilteredImage.populate() -``` - -The progress of long-running calls to `populate()` in datajoint-python can be -visualized by adding the `display_progress=True` argument to the populate call. - -Note that it is not necessary to specify which data needs to be computed. -DataJoint will call `make`, one-by-one, for every key in `Image` for which -`FilteredImage` has not yet been computed. - -Chains of auto-populated tables form computational pipelines in DataJoint. - -## Populate options - -The `populate` method accepts a number of optional arguments that provide more features -and allow greater control over the method's behavior. - -- `restrictions` - A list of restrictions, restricting as -`(tab.key_source & AndList(restrictions)) - tab.proj()`. - Here `target` is the table to be populated, usually `tab` itself. -- `suppress_errors` - If `True`, encountering an error will cancel the current `make` -call, log the error, and continue to the next `make` call. - Error messages will be logged in the job reservation table (if `reserve_jobs` is - `True`) and returned as a list. - See also `return_exception_objects` and `reserve_jobs`. - Defaults to `False`. -- `return_exception_objects` - If `True`, error objects are returned instead of error - messages. - This applies only when `suppress_errors` is `True`. - Defaults to `False`. -- `reserve_jobs` - If `True`, reserves job to indicate to other distributed processes. - The job reservation table may be access as `schema.jobs`. - Errors are logged in the jobs table. - Defaults to `False`. -- `order` - The order of execution, either `"original"`, `"reverse"`, or `"random"`. - Defaults to `"original"`. -- `display_progress` - If `True`, displays a progress bar. - Defaults to `False`. -- `limit` - If not `None`, checks at most this number of keys. - Defaults to `None`. -- `max_calls` - If not `None`, populates at most this many keys. - Defaults to `None`, which means no limit. - -## Progress - -The method `table.progress` reports how many `key_source` entries have been populated -and how many remain. -Two optional parameters allow more advanced use of the method. -A parameter of restriction conditions can be provided, specifying which entities to -consider. -A Boolean parameter `display` (default is `True`) allows disabling the output, such -that the numbers of remaining and total entities are returned but not printed. diff --git a/docs/src/concepts/data-model.md b/docs/src/concepts/data-model.md deleted file mode 100644 index 90460361a..000000000 --- a/docs/src/concepts/data-model.md +++ /dev/null @@ -1,172 +0,0 @@ -# Data Model - -## What is a data model? - -A **data model** is a conceptual framework that defines how data is organized, -represented, and transformed. It gives us the components for creating blueprints for the -structure and operations of data management systems, ensuring consistency and efficiency -in data handling. - -Data management systems are built to accommodate these models, allowing us to manage -data according to the principles laid out by the model. If you’re studying data science -or engineering, you’ve likely encountered different data models, each providing a unique -approach to organizing and manipulating data. - -A data model is defined by considering the following key aspects: - -+ What are the fundamental elements used to structure the data? -+ What operations are available for defining, creating, and manipulating the data? -+ What mechanisms exist to enforce the structure and rules governing valid data interactions? - -## Types of data models - -Among the most familiar data models are those based on files and folders: data of any -kind are lumped together into binary strings called **files**, files are collected into -folders, and folders can be nested within other folders to create a folder hierarchy. - -Another family of data models are various **tabular models**. -For example, items in CSV files are listed in rows, and the attributes of each item are -stored in columns. -Various **spreadsheet** models allow forming dependencies between cells and groups of -cells, including complex calculations. - -The **object data model** is common in programming, where data are represented as -objects in memory with properties and methods for transformations of such data. - -## Relational data model - -The **relational model** is a way of thinking about data as sets and operations on sets. -Formalized almost a half-century ago ([Codd, -1969](https://dl.acm.org/citation.cfm?doid=362384.362685)). The relational data model is -one of the most powerful and precise ways to store and manage structured data. At its -core, this model organizes all data into tables--representing mathematical -relations---where each table consists of rows (representing mathematical tuples) and -columns (often called attributes). - -### Core principles of the relational data model - -**Data representation:** - Data are represented and manipulated in the form of relations. - A relation is a set (i.e. an unordered collection) of entities of values for each of - the respective named attributes of the relation. - Base relations represent stored data while derived relations are formed from base - relations through query expressions. - A collection of base relations with their attributes, domain constraints, uniqueness - constraints, and referential constraints is called a schema. - -**Domain constraints:** - Each attribute (column) in a table is associated with a specific attribute domain (or - datatype, a set of possible values), ensuring that the data entered is valid. - Attribute domains may not include relations, which keeps the data model - flat, i.e. free of nested structures. - -**Uniqueness constraints:** - Entities within relations are addressed by values of their attributes. - To identify and relate data elements, uniqueness constraints are imposed on subsets - of attributes. - Such subsets are then referred to as keys. - One key in a relation is designated as the primary key used for referencing its elements. - -**Referential constraints:** - Associations among data are established by means of referential constraints with the - help of foreign keys. - A referential constraint on relation A referencing relation B allows only those - entities in A whose foreign key attributes match the key attributes of an entity in B. - -**Declarative queries:** - Data queries are formulated through declarative, as opposed to imperative, - specifications of sought results. - This means that query expressions convey the logic for the result rather than the - procedure for obtaining it. - Formal languages for query expressions include relational algebra, relational - calculus, and SQL. - -The relational model has many advantages over both hierarchical file systems and -tabular models for maintaining data integrity and providing flexible access to -interesting subsets of the data. - -Popular implementations of the relational data model rely on the Structured Query -Language (SQL). -SQL comprises distinct sublanguages for schema definition, data manipulation, and data -queries. -SQL thoroughly dominates in the space of relational databases and is often conflated -with the relational data model in casual discourse. -Various terminologies are used to describe related concepts from the relational data -model. -Similar to spreadsheets, relations are often visualized as tables with *attributes* -corresponding to *columns* and *entities* corresponding to *rows*. -In particular, SQL uses the terms *table*, *column*, and *row*. - -## The DataJoint Model - -DataJoint is a conceptual refinement of the relational data model offering a more -expressive and rigorous framework for database programming ([Yatsenko et al., -2018](https://arxiv.org/abs/1807.11104)). The DataJoint model facilitates conceptual -clarity, efficiency, workflow management, and precise and flexible data -queries. By enforcing entity normalization, -simplifying dependency declarations, offering a rich query algebra, and visualizing -relationships through schema diagrams, DataJoint makes relational database programming -more intuitive and robust for complex data pipelines. - -The model has emerged over a decade of continuous development of complex data -pipelines for neuroscience experiments ([Yatsenko et al., -2015](https://www.biorxiv.org/content/early/2015/11/14/031658)). DataJoint has allowed -researchers with no prior knowledge of databases to collaborate effectively on common -data pipelines sustaining data integrity and supporting flexible access. DataJoint is -currently implemented as client libraries in MATLAB and Python. These libraries work by -transpiling DataJoint queries into SQL before passing them on to conventional relational -database systems that serve as the backend, in combination with bulk storage systems for -storing large contiguous data objects. - -DataJoint comprises: - -+ a schema [definition](../design/tables/declare.md) language -+ a data [manipulation](../manipulation/index.md) language -+ a data [query](../query/principles.md) language -+ a [diagramming](../design/diagrams.md) notation for visualizing relationships between -modeled entities - -The key refinement of DataJoint over other relational data models and their -implementations is DataJoint's support of -[entity normalization](../design/normalization.md). - -### Core principles of the DataJoint model - -**Entity Normalization** - DataJoint enforces entity normalization, ensuring that every entity set (table) is - well-defined, with each element belonging to the same type, sharing the same - attributes, and distinguished by the same primary key. This principle reduces - redundancy and avoids data anomalies, similar to Boyce-Codd Normal Form, but with a - more intuitive structure than traditional SQL. - -**Simplified Schema Definition and Dependency Management** - DataJoint introduces a schema definition language that is more expressive and less - error-prone than SQL. Dependencies are explicitly declared using arrow notation - (->), making referential constraints easier to understand and visualize. The - dependency structure is enforced as an acyclic directed graph, which simplifies - workflows by preventing circular dependencies. - -**Integrated Query Operators producing a Relational Algebra** - DataJoint introduces five query operators (restrict, join, project, aggregate, and - union) with algebraic closure, allowing them to be combined seamlessly. These - operators are designed to maintain operational entity normalization, ensuring query - outputs remain valid entity sets. - -**Diagramming Notation for Conceptual Clarity** - DataJoint’s schema diagrams simplify the representation of relationships between - entity sets compared to ERM diagrams. Relationships are expressed as dependencies - between entity sets, which are visualized using solid or dashed lines for primary - and secondary dependencies, respectively. - -**Unified Logic for Binary Operators** - DataJoint simplifies binary operations by requiring attributes involved in joins or - comparisons to be homologous (i.e., sharing the same origin). This avoids the - ambiguity and pitfalls of natural joins in SQL, ensuring more predictable query - results. - -**Optimized Data Pipelines for Scientific Workflows** - DataJoint treats the database as a data pipeline where each entity set defines a - step in the workflow. This makes it ideal for scientific experiments and complex - data processing, such as in neuroscience. Its MATLAB and Python libraries transpile - DataJoint queries into SQL, bridging the gap between scientific programming and - relational databases. diff --git a/docs/src/concepts/data-pipelines.md b/docs/src/concepts/data-pipelines.md deleted file mode 100644 index cf20b075b..000000000 --- a/docs/src/concepts/data-pipelines.md +++ /dev/null @@ -1,166 +0,0 @@ -# Data Pipelines - -## What is a data pipeline? - -A scientific **data pipeline** is a collection of processes and systems for organizing -the data, computations, and workflows used by a research group as they jointly perform -complex sequences of data acquisition, processing, and analysis. - -A variety of tools can be used for supporting shared data pipelines: - -Data repositories - Research teams set up a shared **data repository**. - This minimal data management tool allows depositing and retrieving data and managing - user access. - For example, this may include a collection of files with standard naming conventions - organized into folders and sub-folders. - Or a data repository might reside on the cloud, for example in a collection of S3 - buckets. - This image of data management -- where files are warehoused and retrieved from a - hierarchically-organized system of folders -- is an approach that is likely familiar - to most scientists. - -Database systems - **Databases** are a form of data repository providing additional capabilities: - - 1. Defining, communicating, and enforcing structure in the stored data. - 2. Maintaining data integrity: correct identification of data and consistent cross-references, dependencies, and groupings among the data. - 3. Supporting queries that retrieve various cross-sections and transformation of the deposited data. - - Most scientists have some familiarity with these concepts, for example the notion of maintaining consistency between data and the metadata that describes it, or applying a filter to an Excel spreadsheet to retrieve specific subsets of information. - However, usually the more advanced concepts involved in building and using relational databases fall under the specific expertise of data scientists. - -Data pipelines - **Data pipeline** frameworks may include all the features of a database system along - with additional functionality: - - 1. Integrating computations to perform analyses and manage intermediate results in a principled way. - 2. Supporting distributed computations without conflict. - 3. Defining, communicating, and enforcing **workflow**, making clear the sequence of steps that must be performed for data entry, acquisition, and processing. - - Again, the informal notion of an analysis "workflow" will be familiar to most scientists, along with the logistical difficulties associated with managing a workflow that is shared by multiple scientists within or across labs. - - Therefore, a full-featured data pipeline framework may also be described as a [scientific workflow system](https://en.wikipedia.org/wiki/Scientific_workflow_system). - -Major features of data management frameworks: data repositories, databases, and data pipelines. - -![data pipelines vs databases vs data repositories](../images/pipeline-database.png){: style="align:center"} - -## What is DataJoint? - -DataJoint is a free open-source framework for creating scientific data pipelines -directly from MATLAB or Python (or any mixture of the two). -The data are stored in a language-independent way that allows interoperability between -MATLAB and Python, with additional languages in the works. -DataJoint pipelines become the central tool in the operations of data-intensive labs or -consortia as they organize participants with different roles and skills around a common -framework. - -In DataJoint, a data pipeline is a sequence of steps (more generally, a directed -acyclic graph) with integrated data storage at each step. -The pipeline may have some nodes requiring manual data entry or import from external -sources, some that read from raw data files, and some that perform computations on data -stored in other database nodes. -In a typical scenario, experimenters and acquisition instruments feed data into nodes -at the head of the pipeline, while downstream nodes perform automated computations for -data processing and analysis. - -For example, this is the pipeline for a simple mouse experiment involving calcium -imaging in mice. - -![A data pipeline](../images/pipeline.png){: style="width:250px; align:center"} - -In this example, the experimenter first enters information about a mouse, then enters -information about each imaging session in that mouse, and then each scan performed in -each imaging session. -Next the automated portion of the pipeline takes over to import the raw imaging data, -perform image alignment to compensate for motion, image segmentation to identify cells -in the images, and extraction of calcium traces. -Finally, the receptive field (RF) computation is performed by relating the calcium -signals to the visual stimulus information. - -## How DataJoint works - -DataJoint enables data scientists to build and operate scientific data pipelines. - -Conceptual overview of DataJoint operation. - -![DataJoint operation](../images/how-it-works.png){: style="align:center"} - -DataJoint provides a simple and powerful data model, which is detailed more formally in [Yatsenko D, Walker EY, Tolias AS (2018). DataJoint: A Simpler Relational Data Model.](https://arxiv.org/abs/1807.11104). -Put most generally, a "data model" defines how to think about data and the operations -that can be performed on them. -DataJoint's model is a refinement of the relational data model: all nodes in the -pipeline are simple tables storing data, tables are related by their shared attributes, -and query operations can combine the contents of multiple tables. -DataJoint enforces specific constraints on the relationships between tables that help -maintain data integrity and enable flexible access. -DataJoint uses a succinct data definition language, a powerful data query language, and -expressive visualizations of the pipeline. -A well-defined and principled approach to data organization and computation enables -teams of scientists to work together efficiently. -The data become immediately available to all participants with appropriate access privileges. -Some of the "participants" may be computational agents that perform processing and -analysis, and so DataJoint features a built-in distributed job management process to -allow distributing analysis between any number of computers. - -From a practical point of view, the back-end data architecture may vary depending on -project requirements. -Typically, the data architecture includes a relational database server (e.g. MySQL) and -a bulk data storage system (e.g. [AWS S3](https://aws.amazon.com/s3/) or a filesystem). -However, users need not interact with the database directly, but via MATLAB or Python -objects that are each associated with an individual table in the database. -One of the main advantages of this approach is that DataJoint clearly separates the -data model facing the user from the data architecture implementing data management and -computing. DataJoint works well in combination with good code sharing (e.g. with -[git](https://git-scm.com/)) and environment sharing (e.g. with -[Docker](https://www.docker.com/)). - -DataJoint is designed for quick prototyping and continuous exploration as experimental -designs change or evolve. -New analysis methods can be added or removed at any time, and the structure of the -workflow itself can change over time, for example as new data acquisition methods are -developed. - -With DataJoint, data sharing and publishing is no longer a separate step at the end of -the project. -Instead data sharing is an inherent feature of the process: to share data with other -collaborators or to publish the data to the world, one only needs to set the access -privileges. - -## Real-life example - -The [Mesoscale Activity Project](https://www.simonsfoundation.org/funded-project/%20multi-regional-neuronal-dynamics-of-memory-guided-flexible-behavior/) -(MAP) is a collaborative project between four neuroscience labs. -MAP uses DataJoint for data acquisition, processing, analysis, interfaces, and external sharing. - -The DataJoint pipeline for the MAP project. - -![A data pipeline for the MAP project](../images/map-dataflow.png){: style="align:center"} - -The pipeline is hosted in the cloud through [Amazon Web Services](https://aws.amazon.com/) (AWS). -MAP data scientists at the Janelia Research Campus and Baylor College of Medicine -defined the data pipeline. -Experimental scientists enter manual data directly into the pipeline using the -[Helium web interface](https://github.com/mattbdean/Helium). -The raw data are preprocessed using the DataJoint client libraries in MATLAB and Python; -the preprocessed data are ingested into the pipeline while the bulky and raw data are -shared using [Globus](https://globus.org) transfer through the -[PETREL](https://www.alcf.anl.gov/petrel) storage servers provided by the Argonne -National Lab. -Data are made immediately available for exploration and analysis to collaborating labs, -and the analysis results are also immediately shared. -Analysis data may be visualized through web interfaces. -Intermediate results may be exported into the [NWB](https://nwb.org) format for sharing -with external groups. - -## Summary of DataJoint features - -1. A free, open-source framework for scientific data pipelines and workflow management -2. Data hosting in cloud or in-house -3. MySQL, filesystems, S3, and Globus for data management -4. Define, visualize, and query data pipelines from MATLAB or Python -5. Enter and view data through GUIs -6. Concurrent access by multiple users and computational agents -7. Data integrity: identification, dependencies, groupings -8. Automated distributed computation diff --git a/docs/src/concepts/index.md b/docs/src/concepts/index.md new file mode 100644 index 000000000..b4f11c7cc --- /dev/null +++ b/docs/src/concepts/index.md @@ -0,0 +1,31 @@ +# Concepts + +DataJoint is a framework for scientific workflow management based on relational principles. +For comprehensive coverage of the underlying theory, see the [DataJoint Book](https://datajoint.github.io/datajoint-book). + +## Core Ideas + +**Tables as Entity Sets** +: All data are represented as tables where each row is an entity with the same set of attributes. A primary key uniquely identifies each entity. + +**Data Tiers** +: Tables are categorized by how their data originates: + +| Tier | Python Class | Data Origin | +|------|--------------|-------------| +| Lookup | `dj.Lookup` | Predefined contents (parameters, options) | +| Manual | `dj.Manual` | External entry (user input, ingestion scripts) | +| Imported | `dj.Imported` | Auto-populated from external sources | +| Computed | `dj.Computed` | Auto-populated from upstream tables | + +**Dependencies** +: Foreign keys define relationships between tables, enabling referential integrity and automatic cascading deletes. + +**Schemas** +: Tables are grouped into schemas (database namespaces). Each schema maps to a Python module. + +## Learn More + +- [DataJoint Book: Concepts](https://datajoint.github.io/datajoint-book) — Relational model, data integrity, pipelines +- [DataJoint Book: Design](https://datajoint.github.io/datajoint-book) — Schema design principles, normalization +- [Terminology](terminology.md) — Quick reference for DataJoint terms diff --git a/docs/src/concepts/principles.md b/docs/src/concepts/principles.md deleted file mode 100644 index 2bf491590..000000000 --- a/docs/src/concepts/principles.md +++ /dev/null @@ -1,136 +0,0 @@ -# Principles - -## Theoretical Foundations - -*DataJoint Core* implements a systematic framework for the joint management of -structured scientific data and its associated computations. -The framework builds on the theoretical foundations of the -[Relational Model](https://en.wikipedia.org/wiki/Relational_model) and -the [Entity-Relationship Model](https://en.wikipedia.org/wiki/Entity%E2%80%93relationship_model), -introducing a number of critical clarifications for the effective use of databases as -scientific data pipelines. -Notably, DataJoint introduces the concept of *computational dependencies* as a native -first-class citizen of the data model. -This integration of data structure and computation into a single model, defines a new -class of *computational scientific databases*. - -This page defines the key principles of this model without attachment to a specific -implementation while a more complete description of the model can be found in -[Yatsenko et al, 2018](https://doi.org/10.48550/arXiv.1807.11104). - -DataJoint developers are developing these principles into an -[open standard](https://en.wikipedia.org/wiki/Open_standard) to allow multiple -alternative implementations. - -## Data Representation - -### Tables = Entity Sets - -DataJoint uses only one data structure in all its operations—the *entity set*. - -1. All data are represented in the form of *entity sets*, i.e. an ordered collection of -*entities*. -2. All entities of an entity set belong to the same well-defined entity class and have -the same set of named attributes. -3. Attributes in an entity set has a *data type* (or *domain*), representing the set of -its valid values. -4. Each entity in an entity set provides the *attribute values* for all of the -attributes of its entity class. -5. Each entity set has a *primary key*, *i.e.* a subset of attributes that, jointly, -uniquely identify any entity in the set. - -These formal terms have more common (even if less precise) variants: - -| formal | common | -|:-:|:--:| -| entity set | *table* | -| attribute | *column* | -| attribute value | *field* | - -A collection of *stored tables* make up a *database*. -*Derived tables* are formed through *query expressions*. - -### Table Definition - -DataJoint introduces a streamlined syntax for defining a stored table. - -Each line in the definition defines an attribute with its name, data type, an optional -default value, and an optional comment in the format: - -```python -name [=default] : type [# comment] -``` - -Primary attributes come first and are separated from the rest of the attributes with -the divider `---`. - -For example, the following code defines the entity set for entities of class `Employee`: - -```python -employee_id : int ---- -ssn = null : int # optional social security number -date_of_birth : date -gender : enum('male', 'female', 'other') -home_address="" : varchar(1000) -primary_phone="" : varchar(12) -``` - -### Data Tiers - -Stored tables are designated into one of four *tiers* indicating how their data -originates. - -| table tier | data origin | -| --- | --- | -| lookup | contents are part of the table definition, defined *a priori* rather than entered externally. Typical stores general facts, parameters, options, *etc.* | -| manual | contents are populated by external mechanisms such as manual entry through web apps or by data ingest scripts | -| imported | contents are populated automatically by pipeline computations accessing data from upstream in the pipeline **and** from external data sources such as raw data stores.| -| computed | contents are populated automatically by pipeline computations accessing data from upstream in the pipeline. | - -### Object Serialization - -### Data Normalization - -A collection of data is considered normalized when organized into a collection of -entity sets, where each entity set represents a well-defined entity class with all its -attributes applicable to each entity in the set and the same primary key identifying - -The normalization procedure often includes splitting data from one table into several -tables, one for each proper entity set. - -### Databases and Schemas - -Stored tables are named and grouped into namespaces called *schemas*. -A collection of schemas make up a *database*. -A *database* has a globally unique address or name. -A *schema* has a unique name within its database. -Within a *connection* to a particular database, a stored table is identified as -`schema.Table`. -A schema typically groups tables that are logically related. - -## Dependencies - -Entity sets can form referential dependencies that express and - -### Diagramming - -## Data integrity - -### Entity integrity - -*Entity integrity* is the guarantee made by the data management process of the 1:1 -mapping between real-world entities and their digital representations. -In practice, entity integrity is ensured when it is made clear - -### Referential integrity - -### Group integrity - -## Data manipulations - -## Data queries - -### Query Operators - -## Pipeline computations diff --git a/docs/src/concepts/teamwork.md b/docs/src/concepts/teamwork.md deleted file mode 100644 index a0a782dde..000000000 --- a/docs/src/concepts/teamwork.md +++ /dev/null @@ -1,97 +0,0 @@ -# Teamwork - -## Data management in a science project - -Science labs organize their projects as a sequence of activities of experiment design, -data acquisition, and processing and analysis. - -![data science in a science lab](../images/data-science-before.png){: style="width:510px; display:block; margin: 0 auto;"} - -
Workflow and dataflow in a common findings-centered approach to data science in a science lab.
- -Many labs lack a uniform data management strategy that would span longitudinally across -the entire project lifecycle as well as laterally across different projects. - -Prior to publishing their findings, the research team may need to publish the data to -support their findings. -Without a data management system, this requires custom repackaging of the data to -conform to the [FAIR principles](https://www.nature.com/articles/sdata201618) for -scientific data management. - -## Data-centric project organization - -DataJoint is designed to support a data-centric approach to large science projects in -which data are viewed as a principal output of the research project and are managed -systematically throughout in a single framework through the entire process. - -This approach requires formulating a general data science plan and upfront investment -for setting up resources and processes and training the teams. -The team uses DataJoint to build data pipelines to support multiple projects. - -![data science in a science lab](../images/data-science-after.png){: style="width:510px; display:block; margin: 0 auto;"} - -
Workflow and dataflow in a data pipeline-centered approach.
- -Data pipelines support project data across their entire lifecycle, including the -following functions - -- experiment design -- animal colony management -- electronic lab book: manual data entry during experiments through graphical user interfaces. -- acquisition from instrumentation in the course of experiments -- ingest from raw acquired data -- computations for data analysis -- visualization of analysis results -- export for sharing and publishing - -Through all these activities, all these data are made accessible to all authorized -participants and distributed computations can be done in parallel without compromising -data integrity. - -## Team roles - -The adoption of a uniform data management framework allows separation of roles and -division of labor among team members, leading to greater efficiency and better scaling. - -![data science in a science lab](../images/data-engineering.png){: style="width:510px; display:block; margin: 0 auto;"} - -
Distinct responsibilities of data science and data engineering.
- -### Scientists - -Design and conduct experiments, collecting data. -They interact with the data pipeline through graphical user interfaces designed by -others. -They understand what analysis is used to test their hypotheses. - -### Data scientists - -Have the domain expertise and select and implement the processing and analysis -methods for experimental data. -Data scientists are in charge of defining and managing the data pipeline using -DataJoint's data model, but they may not know the details of the underlying -architecture. -They interact with the pipeline using client programming interfaces directly from -languages such as MATLAB and Python. - -The bulk of this manual is written for working data scientists, except for System -Administration. - -### Data engineers - -Work with the data scientists to support the data pipeline. -They rely on their understanding of the DataJoint data model to configure and -administer the required IT resources such as database servers, data storage -servers, networks, cloud instances, [Globus](https://globus.org) endpoints, etc. -Data engineers can provide general solutions such as web hosting, data publishing, -interfaces, exports and imports. - -The System Administration section of this tutorial contains materials helpful in -accomplishing these tasks. - -DataJoint is designed to delineate a clean boundary between **data science** and **data -engineering**. -This allows data scientists to use the same uniform data model for data pipelines -backed by a variety of information technologies. -This delineation also enables economies of scale as a single data engineering team can -support a wide spectrum of science projects. diff --git a/docs/src/datatypes/adapters.md b/docs/src/datatypes/adapters.md new file mode 100644 index 000000000..267e0420b --- /dev/null +++ b/docs/src/datatypes/adapters.md @@ -0,0 +1,614 @@ +# Custom Attribute Types + +In modern scientific research, data pipelines often involve complex workflows that +generate diverse data types. From high-dimensional imaging data to machine learning +models, these data types frequently exceed the basic representations supported by +traditional relational databases. For example: + ++ A lab working on neural connectivity might use graph objects to represent brain + networks. ++ Researchers processing raw imaging data might store custom objects for pre-processing + configurations. ++ Computational biologists might store fitted machine learning models or parameter + objects for downstream predictions. + +To handle these diverse needs, DataJoint provides the **AttributeType** system. It +enables researchers to store and retrieve complex, non-standard data types—like Python +objects or data structures—in a relational database while maintaining the +reproducibility, modularity, and query capabilities required for scientific workflows. + +## Overview + +Custom attribute types define bidirectional conversion between: + +- **Python objects** (what your code works with) +- **Storage format** (what gets stored in the database) + +``` +┌─────────────────┐ encode() ┌─────────────────┐ +│ Python Object │ ───────────────► │ Storage Type │ +│ (e.g. Graph) │ │ (e.g. blob) │ +└─────────────────┘ decode() └─────────────────┘ + ◄─────────────── +``` + +## Defining Custom Types + +Create a custom type by subclassing `dj.AttributeType` and implementing the required +methods: + +```python +import datajoint as dj +import networkx as nx + +@dj.register_type +class GraphType(dj.AttributeType): + """Custom type for storing networkx graphs.""" + + # Required: unique identifier used in table definitions + type_name = "graph" + + # Required: underlying DataJoint storage type + dtype = "longblob" + + def encode(self, graph, *, key=None): + """Convert graph to storable format (called on INSERT).""" + return list(graph.edges) + + def decode(self, edges, *, key=None): + """Convert stored data back to graph (called on FETCH).""" + return nx.Graph(edges) +``` + +### Required Components + +| Component | Description | +|-----------|-------------| +| `type_name` | Unique identifier used in table definitions with `` syntax | +| `dtype` | Underlying DataJoint type for storage (e.g., `"longblob"`, `"varchar(255)"`, `"json"`) | +| `encode(value, *, key=None)` | Converts Python object to storable format | +| `decode(stored, *, key=None)` | Converts stored data back to Python object | + +### Using Custom Types in Tables + +Once registered, use the type in table definitions with angle brackets: + +```python +@schema +class Connectivity(dj.Manual): + definition = """ + conn_id : int + --- + conn_graph = null : # Uses the GraphType we defined + """ +``` + +Insert and fetch work seamlessly: + +```python +import networkx as nx + +# Insert - encode() is called automatically +g = nx.lollipop_graph(4, 2) +Connectivity.insert1({"conn_id": 1, "conn_graph": g}) + +# Fetch - decode() is called automatically +result = (Connectivity & "conn_id = 1").fetch1("conn_graph") +assert isinstance(result, nx.Graph) +``` + +## Type Registration + +### Decorator Registration + +The simplest way to register a type is with the `@dj.register_type` decorator: + +```python +@dj.register_type +class MyType(dj.AttributeType): + type_name = "my_type" + ... +``` + +### Direct Registration + +You can also register types explicitly: + +```python +class MyType(dj.AttributeType): + type_name = "my_type" + ... + +dj.register_type(MyType) +``` + +### Listing Registered Types + +```python +# List all registered type names +print(dj.list_types()) +``` + +## Validation + +Add data validation by overriding the `validate()` method. It's called automatically +before `encode()` during INSERT operations: + +```python +@dj.register_type +class PositiveArrayType(dj.AttributeType): + type_name = "positive_array" + dtype = "longblob" + + def validate(self, value): + """Ensure all values are positive.""" + import numpy as np + if not isinstance(value, np.ndarray): + raise TypeError(f"Expected numpy array, got {type(value).__name__}") + if np.any(value < 0): + raise ValueError("Array must contain only positive values") + + def encode(self, array, *, key=None): + return array + + def decode(self, stored, *, key=None): + return stored +``` + +## Storage Types (dtype) + +The `dtype` property specifies how data is stored in the database: + +| dtype | Use Case | Stored Format | +|-------|----------|---------------| +| `"longblob"` | Complex Python objects, arrays | Serialized binary | +| `"blob"` | Smaller objects | Serialized binary | +| `"json"` | JSON-serializable data | JSON string | +| `"varchar(N)"` | String representations | Text | +| `"int"` | Integer identifiers | Integer | +| `"blob@store"` | Large objects in external storage | UUID reference | +| `"object"` | Files/folders in object storage | JSON metadata | +| `""` | Chain to another custom type | Varies | + +### External Storage + +For large data, use external blob storage: + +```python +@dj.register_type +class LargeArrayType(dj.AttributeType): + type_name = "large_array" + dtype = "blob@mystore" # Uses external store named "mystore" + + def encode(self, array, *, key=None): + return array + + def decode(self, stored, *, key=None): + return stored +``` + +## Type Chaining + +Custom types can build on other custom types by referencing them in `dtype`: + +```python +@dj.register_type +class CompressedGraphType(dj.AttributeType): + type_name = "compressed_graph" + dtype = "" # Chain to the GraphType + + def encode(self, graph, *, key=None): + # Compress before passing to GraphType + return self._compress(graph) + + def decode(self, stored, *, key=None): + # GraphType's decode already ran + return self._decompress(stored) +``` + +DataJoint automatically resolves the chain to find the final storage type. + +## The Key Parameter + +The `key` parameter provides access to primary key values during encode/decode +operations. This is useful when the conversion depends on record context: + +```python +@dj.register_type +class ContextAwareType(dj.AttributeType): + type_name = "context_aware" + dtype = "longblob" + + def encode(self, value, *, key=None): + if key and key.get("version") == 2: + return self._encode_v2(value) + return self._encode_v1(value) + + def decode(self, stored, *, key=None): + if key and key.get("version") == 2: + return self._decode_v2(stored) + return self._decode_v1(stored) +``` + +## Publishing Custom Types as Packages + +Custom types can be distributed as installable packages using Python entry points. +This allows types to be automatically discovered when the package is installed. + +### Package Structure + +``` +dj-graph-types/ +├── pyproject.toml +└── src/ + └── dj_graph_types/ + ├── __init__.py + └── types.py +``` + +### pyproject.toml + +```toml +[project] +name = "dj-graph-types" +version = "1.0.0" + +[project.entry-points."datajoint.types"] +graph = "dj_graph_types.types:GraphType" +weighted_graph = "dj_graph_types.types:WeightedGraphType" +``` + +### Type Implementation + +```python +# src/dj_graph_types/types.py +import datajoint as dj +import networkx as nx + +class GraphType(dj.AttributeType): + type_name = "graph" + dtype = "longblob" + + def encode(self, graph, *, key=None): + return list(graph.edges) + + def decode(self, edges, *, key=None): + return nx.Graph(edges) + +class WeightedGraphType(dj.AttributeType): + type_name = "weighted_graph" + dtype = "longblob" + + def encode(self, graph, *, key=None): + return [(u, v, d) for u, v, d in graph.edges(data=True)] + + def decode(self, edges, *, key=None): + g = nx.Graph() + g.add_weighted_edges_from(edges) + return g +``` + +### Usage After Installation + +```bash +pip install dj-graph-types +``` + +```python +# Types are automatically available after package installation +@schema +class MyTable(dj.Manual): + definition = """ + id : int + --- + network : + weighted_network : + """ +``` + +## Complete Example + +Here's a complete example demonstrating custom types for a neuroscience workflow: + +```python +import datajoint as dj +import numpy as np + +# Configure DataJoint +dj.config["database.host"] = "localhost" +dj.config["database.user"] = "root" +dj.config["database.password"] = "password" + +# Define custom types +@dj.register_type +class SpikeTrainType(dj.AttributeType): + """Efficient storage for sparse spike timing data.""" + type_name = "spike_train" + dtype = "longblob" + + def validate(self, value): + if not isinstance(value, np.ndarray): + raise TypeError("Expected numpy array of spike times") + if value.ndim != 1: + raise ValueError("Spike train must be 1-dimensional") + if not np.all(np.diff(value) >= 0): + raise ValueError("Spike times must be sorted") + + def encode(self, spike_times, *, key=None): + # Store as differences (smaller values, better compression) + return np.diff(spike_times, prepend=0).astype(np.float32) + + def decode(self, stored, *, key=None): + # Reconstruct original spike times + return np.cumsum(stored).astype(np.float64) + + +@dj.register_type +class WaveformType(dj.AttributeType): + """Storage for spike waveform templates with metadata.""" + type_name = "waveform" + dtype = "longblob" + + def encode(self, waveform_dict, *, key=None): + return { + "data": waveform_dict["data"].astype(np.float32), + "sampling_rate": waveform_dict["sampling_rate"], + "channel_ids": list(waveform_dict["channel_ids"]), + } + + def decode(self, stored, *, key=None): + return { + "data": stored["data"].astype(np.float64), + "sampling_rate": stored["sampling_rate"], + "channel_ids": np.array(stored["channel_ids"]), + } + + +# Create schema and tables +schema = dj.schema("ephys_analysis") + +@schema +class Unit(dj.Manual): + definition = """ + unit_id : int + --- + spike_times : + waveform : + quality : enum('good', 'mua', 'noise') + """ + + +# Usage +spike_times = np.array([0.1, 0.15, 0.23, 0.45, 0.67, 0.89]) +waveform = { + "data": np.random.randn(82, 4), + "sampling_rate": 30000, + "channel_ids": [10, 11, 12, 13], +} + +Unit.insert1({ + "unit_id": 1, + "spike_times": spike_times, + "waveform": waveform, + "quality": "good", +}) + +# Fetch - automatically decoded +result = (Unit & "unit_id = 1").fetch1() +print(f"Spike times: {result['spike_times']}") +print(f"Waveform shape: {result['waveform']['data'].shape}") +``` + +## Migration from AttributeAdapter + +The `AttributeAdapter` class is deprecated. Migrate to `AttributeType`: + +### Before (deprecated) + +```python +class GraphAdapter(dj.AttributeAdapter): + attribute_type = "longblob" + + def put(self, obj): + return list(obj.edges) + + def get(self, value): + return nx.Graph(value) + +# Required context-based registration +graph = GraphAdapter() +schema = dj.schema("mydb", context={"graph": graph}) +``` + +### After (recommended) + +```python +@dj.register_type +class GraphType(dj.AttributeType): + type_name = "graph" + dtype = "longblob" + + def encode(self, obj, *, key=None): + return list(obj.edges) + + def decode(self, value, *, key=None): + return nx.Graph(value) + +# Global registration - no context needed +schema = dj.schema("mydb") +``` + +### Key Differences + +| Aspect | AttributeAdapter (deprecated) | AttributeType (recommended) | +|--------|-------------------------------|----------------------------| +| Methods | `put()` / `get()` | `encode()` / `decode()` | +| Storage type | `attribute_type` | `dtype` | +| Type name | Variable name in context | `type_name` property | +| Registration | Context dict per schema | Global `@register_type` decorator | +| Validation | Manual | Built-in `validate()` method | +| Distribution | Copy adapter code | Entry point packages | +| Key access | Not available | Optional `key` parameter | + +## Best Practices + +1. **Choose descriptive type names**: Use lowercase with underscores (e.g., `spike_train`, `graph_embedding`) + +2. **Select appropriate storage types**: Use `` for complex objects, `json` for simple structures, external storage for large data + +3. **Add validation**: Use `validate()` to catch data errors early + +4. **Document your types**: Include docstrings explaining the expected input/output formats + +5. **Handle None values**: Your encode/decode methods may receive `None` for nullable attributes + +6. **Consider versioning**: If your encoding format might change, include version information + +7. **Test round-trips**: Ensure `decode(encode(x)) == x` for all valid inputs + +```python +def test_graph_type_roundtrip(): + g = nx.lollipop_graph(4, 2) + t = GraphType() + + encoded = t.encode(g) + decoded = t.decode(encoded) + + assert set(g.edges) == set(decoded.edges) +``` + +## Built-in Types + +DataJoint includes a built-in type for explicit blob serialization: + +### `` - DataJoint Blob Serialization + +The `` type provides explicit control over DataJoint's native binary +serialization. It supports: + +- NumPy arrays (compatible with MATLAB) +- Python dicts, lists, tuples, sets +- datetime objects, Decimals, UUIDs +- Nested data structures +- Optional compression + +```python +@schema +class ProcessedData(dj.Manual): + definition = """ + data_id : int + --- + results : # Serialized Python objects + raw_bytes : longblob # Raw bytes (no serialization) + """ +``` + +#### When to Use `` + +- **Serialized data**: When storing Python objects (dicts, arrays, etc.) +- **New tables**: Prefer `` for automatic serialization +- **Migration**: Existing schemas with implicit serialization must migrate + +#### Raw Blob Behavior + +Plain `longblob` (and other blob variants) columns now store and return +**raw bytes** without automatic serialization: + +```python +@schema +class RawData(dj.Manual): + definition = """ + id : int + --- + raw_bytes : longblob # Stores/returns raw bytes + serialized : # Stores Python objects with serialization + """ + +# Raw bytes - no serialization +RawData.insert1({"id": 1, "raw_bytes": b"raw binary data", "serialized": {"key": "value"}}) + +row = (RawData & "id=1").fetch1() +row["raw_bytes"] # Returns: b"raw binary data" +row["serialized"] # Returns: {"key": "value"} +``` + +**Important**: Existing schemas that relied on implicit blob serialization +must be migrated to `` to preserve their behavior. + +## Schema Migration + +When upgrading existing schemas to use explicit type declarations, DataJoint +provides migration utilities. + +### Analyzing Blob Columns + +```python +import datajoint as dj + +schema = dj.schema("my_database") + +# Check migration status +status = dj.migrate.check_migration_status(schema) +print(f"Blob columns: {status['total_blob_columns']}") +print(f"Already migrated: {status['migrated']}") +print(f"Pending migration: {status['pending']}") +``` + +### Generating Migration SQL + +```python +# Preview migration (dry run) +result = dj.migrate.migrate_blob_columns(schema, dry_run=True) +for sql in result['sql_statements']: + print(sql) +``` + +### Applying Migration + +```python +# Apply migration +result = dj.migrate.migrate_blob_columns(schema, dry_run=False) +print(f"Migrated {result['migrated']} columns") +``` + +### Migration Details + +The migration updates MySQL column comments to include the type declaration. +This is a **metadata-only** change - the actual blob data format is unchanged. + +All blob type variants are handled: `tinyblob`, `blob`, `mediumblob`, `longblob`. + +Before migration: +- Column: `longblob` (or `blob`, `mediumblob`, etc.) +- Comment: `user comment` +- Behavior: Auto-serialization (implicit) + +After migration: +- Column: `longblob` (unchanged) +- Comment: `::user comment` +- Behavior: Explicit serialization via `` + +### Updating Table Definitions + +After database migration, update your Python table definitions for consistency: + +```python +# Before +class MyTable(dj.Manual): + definition = """ + id : int + --- + data : longblob # stored data + """ + +# After +class MyTable(dj.Manual): + definition = """ + id : int + --- + data : # stored data + """ +``` + +Both definitions work identically after migration, but using `` makes +the serialization explicit and documents the intended behavior. diff --git a/docs/src/design/tables/attach.md b/docs/src/datatypes/attach.md similarity index 100% rename from docs/src/design/tables/attach.md rename to docs/src/datatypes/attach.md diff --git a/docs/src/datatypes/blob.md b/docs/src/datatypes/blob.md new file mode 100644 index 000000000..d7363906b --- /dev/null +++ b/docs/src/datatypes/blob.md @@ -0,0 +1,287 @@ +# Blobs + +Blob attributes store serialized Python objects in the database. DataJoint +automatically serializes objects on insert and deserializes them on fetch. + +## Defining Blob Attributes + +```python +@schema +class Recording(dj.Manual): + definition = """ + recording_id : int + --- + signal : longblob # numpy array + metadata : longblob # dictionary + timestamps : longblob # 1D array + """ +``` + +### Blob Sizes + +| Type | Max Size | Use Case | +|------|----------|----------| +| `tinyblob` | 255 bytes | Small binary data | +| `blob` | 64 KB | Small arrays | +| `mediumblob` | 16 MB | Medium arrays | +| `longblob` | 4 GB | Large arrays, images | + +Use `longblob` for most scientific data to avoid size limitations. + +## Inserting Blobs + +```python +import numpy as np + +# Insert numpy arrays +Recording.insert1({ + 'recording_id': 1, + 'signal': np.random.randn(10000, 64), # 10k samples, 64 channels + 'metadata': {'sampling_rate': 30000, 'gain': 1.5}, + 'timestamps': np.linspace(0, 10, 10000) +}) +``` + +### Supported Types + +DataJoint serializes these Python types: + +**Scalars** +```python +data = { + 'int_val': 42, + 'float_val': 3.14159, + 'bool_val': True, + 'str_val': 'hello world', +} +``` + +**Collections** +```python +data = { + 'list_val': [1, 2, 3, 4, 5], + 'tuple_val': (1, 'a', 3.14), + 'set_val': {1, 2, 3}, + 'dict_val': {'key1': 'value1', 'key2': [1, 2, 3]}, +} +``` + +**NumPy Arrays** +```python +data = { + 'array_1d': np.array([1, 2, 3, 4, 5]), + 'array_2d': np.random.randn(100, 100), + 'array_3d': np.zeros((10, 256, 256)), # e.g., video frames + 'complex_array': np.array([1+2j, 3+4j]), + 'structured': np.array([(1, 2.0), (3, 4.0)], + dtype=[('x', 'i4'), ('y', 'f8')]), +} +``` + +**Special Types** +```python +import uuid +from decimal import Decimal +from datetime import datetime, date + +data = { + 'uuid_val': uuid.uuid4(), + 'decimal_val': Decimal('3.14159265358979'), + 'datetime_val': datetime.now(), + 'date_val': date.today(), +} +``` + +## Fetching Blobs + +Blobs are automatically deserialized on fetch: + +```python +# Fetch entire entity +record = (Recording & 'recording_id=1').fetch1() +signal = record['signal'] # numpy array +metadata = record['metadata'] # dict + +# Fetch specific blob attribute +signal = (Recording & 'recording_id=1').fetch1('signal') +print(signal.shape) # (10000, 64) +print(signal.dtype) # float64 + +# Fetch multiple blobs +signal, timestamps = (Recording & 'recording_id=1').fetch1('signal', 'timestamps') +``` + +## External Storage + +For large blobs, use external storage to avoid database bloat: + +```python +@schema +class LargeData(dj.Manual): + definition = """ + data_id : int + --- + large_array : blob@external # stored outside database + """ +``` + +Configure external storage in settings: + +```json +{ + "stores": { + "external": { + "protocol": "file", + "location": "/data/blobs" + } + } +} +``` + +See [External Store](../admin/external-store.md) for configuration details. + +## Compression + +Blobs larger than 1 KiB are automatically compressed using zlib. This is +transparent to users—compression/decompression happens automatically. + +```python +# Large array is compressed automatically +large_data = np.random.randn(1000000) # ~8 MB uncompressed +Table.insert1({'data': large_data}) # Stored compressed +fetched = Table.fetch1('data') # Decompressed automatically +``` + +## Performance Tips + +### Use Appropriate Data Types + +```python +# Good: use float32 when float64 precision isn't needed +signal = signal.astype(np.float32) # Half the storage + +# Good: use appropriate integer sizes +counts = counts.astype(np.uint16) # If values < 65536 +``` + +### Avoid Storing Redundant Data + +```python +# Bad: store computed values that can be derived +Recording.insert1({ + 'signal': signal, + 'mean': signal.mean(), # Can be computed from signal + 'std': signal.std(), # Can be computed from signal +}) + +# Good: compute on fetch +signal = Recording.fetch1('signal') +mean, std = signal.mean(), signal.std() +``` + +### Consider Chunking Large Data + +```python +# For very large data, consider splitting into chunks +@schema +class VideoFrame(dj.Manual): + definition = """ + -> Video + frame_num : int + --- + frame : longblob + """ + +# Store frames individually rather than entire video +for i, frame in enumerate(video_frames): + VideoFrame.insert1({'video_id': 1, 'frame_num': i, 'frame': frame}) +``` + +## MATLAB Compatibility + +DataJoint's blob format is compatible with MATLAB's mYm serialization, +allowing data sharing between Python and MATLAB pipelines: + +```python +# Data inserted from Python +Table.insert1({'data': np.array([[1, 2], [3, 4]])}) +``` + +```matlab +% Fetched in MATLAB +data = fetch1(Table, 'data'); +% data is a 2x2 matrix +``` + +## Common Patterns + +### Store Model Weights + +```python +@schema +class TrainedModel(dj.Computed): + definition = """ + -> TrainingRun + --- + weights : longblob + architecture : varchar(100) + accuracy : float + """ + + def make(self, key): + model = train_model(key) + self.insert1(dict( + key, + weights=model.get_weights(), + architecture=model.name, + accuracy=evaluate(model) + )) +``` + +### Store Image Data + +```python +@schema +class Image(dj.Manual): + definition = """ + image_id : int + --- + pixels : longblob # HxWxC array + format : varchar(10) # 'RGB', 'RGBA', 'grayscale' + """ + +# Insert image +import imageio +img = imageio.imread('photo.png') +Image.insert1({'image_id': 1, 'pixels': img, 'format': 'RGB'}) + +# Fetch and display +import matplotlib.pyplot as plt +pixels = (Image & 'image_id=1').fetch1('pixels') +plt.imshow(pixels) +``` + +### Store Time Series + +```python +@schema +class TimeSeries(dj.Imported): + definition = """ + -> Recording + --- + data : longblob # NxT array (N channels, T samples) + sampling_rate : float # Hz + start_time : float # seconds + """ + + def make(self, key): + data, sr, t0 = load_recording(key) + self.insert1(dict(key, data=data, sampling_rate=sr, start_time=t0)) +``` + +## Limitations + +- Blob content is opaque to SQL queries (can't filter by array values) +- Large blobs increase database backup size +- Consider [object type](object.md) for very large files or cloud storage +- Avoid storing objects with external references (file handles, connections) diff --git a/docs/src/design/tables/filepath.md b/docs/src/datatypes/filepath.md similarity index 97% rename from docs/src/design/tables/filepath.md rename to docs/src/datatypes/filepath.md index 05e9ca744..8d0171f1c 100644 --- a/docs/src/design/tables/filepath.md +++ b/docs/src/datatypes/filepath.md @@ -16,7 +16,7 @@ tables to reference data which reside outside of DataJoint pipelines. To define a table using the `filepath` datatype, an existing DataJoint -[store](../../sysadmin/external-store.md) should be created and then referenced in the +[store](../admin/external-store.md) should be created and then referenced in the new table definition. For example, given a simple store: ```python diff --git a/docs/src/datatypes/object.md b/docs/src/datatypes/object.md new file mode 100644 index 000000000..e2ed8bf25 --- /dev/null +++ b/docs/src/datatypes/object.md @@ -0,0 +1,357 @@ +# Object Type + +The `object` type provides managed file and folder storage for DataJoint pipelines. Unlike `attach@store` and `filepath@store` which reference named stores, the `object` type uses a unified storage backend configured at the pipeline level. + +## Overview + +The `object` type supports both files and folders: + +- **Files**: Copied to storage at insert time, accessed via handle on fetch +- **Folders**: Entire directory trees stored as a unit (e.g., Zarr arrays) +- **Staged inserts**: Write directly to storage for large objects + +### Key Features + +- **Unified storage**: One storage backend per pipeline (local filesystem or cloud) +- **No hidden tables**: Metadata stored inline as JSON (simpler than `attach@store`) +- **fsspec integration**: Direct access for Zarr, xarray, and other array libraries +- **Immutable objects**: Content cannot be modified after insert + +## Configuration + +Configure object storage in `datajoint.json`: + +```json +{ + "object_storage": { + "project_name": "my_project", + "protocol": "s3", + "bucket": "my-bucket", + "location": "my_project", + "endpoint": "s3.amazonaws.com" + } +} +``` + +For local filesystem storage: + +```json +{ + "object_storage": { + "project_name": "my_project", + "protocol": "file", + "location": "/data/my_project" + } +} +``` + +### Configuration Options + +| Setting | Required | Description | +|---------|----------|-------------| +| `project_name` | Yes | Unique project identifier | +| `protocol` | Yes | Storage backend: `file`, `s3`, `gcs`, `azure` | +| `location` | Yes | Base path or bucket prefix | +| `bucket` | For cloud | Bucket name (S3, GCS, Azure) | +| `endpoint` | For S3 | S3 endpoint URL | +| `partition_pattern` | No | Path pattern with `{attribute}` placeholders | +| `token_length` | No | Random suffix length (default: 8, range: 4-16) | + +### Environment Variables + +Settings can be overridden via environment variables: + +```bash +DJ_OBJECT_STORAGE_PROTOCOL=s3 +DJ_OBJECT_STORAGE_BUCKET=my-bucket +DJ_OBJECT_STORAGE_LOCATION=my_project +``` + +## Table Definition + +Define an object attribute in your table: + +```python +@schema +class Recording(dj.Manual): + definition = """ + subject_id : int + session_id : int + --- + raw_data : object # managed file storage + processed : object # another object attribute + """ +``` + +Note: No `@store` suffix needed—storage is determined by pipeline configuration. + +## Insert Operations + +### Inserting Files + +Insert a file by providing its local path: + +```python +Recording.insert1({ + "subject_id": 123, + "session_id": 45, + "raw_data": "/local/path/to/recording.dat" +}) +``` + +The file is copied to object storage and the path is stored as JSON metadata. + +### Inserting Folders + +Insert an entire directory: + +```python +Recording.insert1({ + "subject_id": 123, + "session_id": 45, + "raw_data": "/local/path/to/data_folder/" +}) +``` + +### Inserting from Remote URLs + +Insert from cloud storage or HTTP sources—content is copied to managed storage: + +```python +# From S3 +Recording.insert1({ + "subject_id": 123, + "session_id": 45, + "raw_data": "s3://source-bucket/path/to/data.dat" +}) + +# From Google Cloud Storage (e.g., collaborator data) +Recording.insert1({ + "subject_id": 123, + "session_id": 45, + "neural_data": "gs://collaborator-bucket/shared/experiment.zarr" +}) + +# From HTTP/HTTPS +Recording.insert1({ + "subject_id": 123, + "session_id": 45, + "raw_data": "https://example.com/public/data.dat" +}) +``` + +Supported protocols: `s3://`, `gs://`, `az://`, `http://`, `https://` + +Remote sources may require credentials configured via environment variables or fsspec configuration files. + +### Inserting from Streams + +Insert from a file-like object with explicit extension: + +```python +with open("/local/path/data.bin", "rb") as f: + Recording.insert1({ + "subject_id": 123, + "session_id": 45, + "raw_data": (".bin", f) + }) +``` + +### Staged Insert (Direct Write) + +For large objects like Zarr arrays, use staged insert to write directly to storage without a local copy: + +```python +import zarr + +with Recording.staged_insert1 as staged: + # Set primary key values first + staged.rec['subject_id'] = 123 + staged.rec['session_id'] = 45 + + # Create Zarr array directly in object storage + z = zarr.open(staged.store('raw_data', '.zarr'), mode='w', shape=(10000, 10000)) + z[:] = compute_large_array() + + # Assign to record + staged.rec['raw_data'] = z + +# On successful exit: metadata computed, record inserted +# On exception: storage cleaned up, no record inserted +``` + +The `staged_insert1` context manager provides: + +- `staged.rec`: Dict for setting attribute values +- `staged.store(field, ext)`: Returns `fsspec.FSMap` for Zarr/xarray +- `staged.open(field, ext, mode)`: Returns file handle for writing +- `staged.fs`: Direct fsspec filesystem access + +## Fetch Operations + +Fetching an object attribute returns an `ObjectRef` handle: + +```python +record = Recording.fetch1() +obj = record["raw_data"] + +# Access metadata (no I/O) +print(obj.path) # Storage path +print(obj.size) # Size in bytes +print(obj.ext) # File extension (e.g., ".dat") +print(obj.is_dir) # True if folder +``` + +### Reading File Content + +```python +# Read entire file as bytes +content = obj.read() + +# Open as file object +with obj.open() as f: + data = f.read() +``` + +### Working with Folders + +```python +# List contents +contents = obj.listdir() + +# Walk directory tree +for root, dirs, files in obj.walk(): + print(root, files) + +# Open specific file in folder +with obj.open("subdir/file.dat") as f: + data = f.read() +``` + +### Downloading Files + +Download to local filesystem: + +```python +# Download entire object +local_path = obj.download("/local/destination/") + +# Download specific file from folder +local_path = obj.download("/local/destination/", "subdir/file.dat") +``` + +### Integration with Zarr and xarray + +The `ObjectRef` provides direct fsspec access: + +```python +import zarr +import xarray as xr + +record = Recording.fetch1() +obj = record["raw_data"] + +# Open as Zarr array +z = zarr.open(obj.store, mode='r') +print(z.shape) + +# Open with xarray +ds = xr.open_zarr(obj.store) + +# Access fsspec filesystem directly +fs = obj.fs +files = fs.ls(obj.full_path) +``` + +### Verifying Integrity + +Verify that stored content matches metadata: + +```python +try: + obj.verify() + print("Object integrity verified") +except IntegrityError as e: + print(f"Verification failed: {e}") +``` + +For files, this checks size (and hash if available). For folders, it validates the manifest. + +## Storage Structure + +Objects are stored with a deterministic path structure: + +``` +{location}/{schema}/{Table}/objects/{pk_attrs}/{field}_{token}{ext} +``` + +Example: +``` +my_project/my_schema/Recording/objects/subject_id=123/session_id=45/raw_data_Ax7bQ2kM.dat +``` + +### Partitioning + +Use `partition_pattern` to organize files by attributes: + +```json +{ + "object_storage": { + "partition_pattern": "{subject_id}/{session_id}" + } +} +``` + +This promotes specified attributes to the path root for better organization: + +``` +my_project/subject_id=123/session_id=45/my_schema/Recording/objects/raw_data_Ax7bQ2kM.dat +``` + +## Database Storage + +The `object` type is stored as a JSON column containing metadata: + +```json +{ + "path": "my_schema/Recording/objects/subject_id=123/raw_data_Ax7bQ2kM.dat", + "size": 12345, + "hash": null, + "ext": ".dat", + "is_dir": false, + "timestamp": "2025-01-15T10:30:00Z", + "mime_type": "application/octet-stream" +} +``` + +For folders, the metadata includes `item_count` and a manifest file is stored alongside the folder in object storage. + +## Comparison with Other Types + +| Feature | `attach@store` | `filepath@store` | `object` | +|---------|----------------|------------------|----------| +| Store config | Per-attribute | Per-attribute | Per-pipeline | +| Path control | DataJoint | User-managed | DataJoint | +| Hidden tables | Yes | Yes | **No** | +| Backend | File/S3 only | File/S3 only | fsspec (any) | +| Metadata storage | External table | External table | Inline JSON | +| Folder support | No | No | **Yes** | +| Direct write | No | No | **Yes** | + +## Delete Behavior + +When a record is deleted: + +1. Database record is deleted first (within transaction) +2. Storage file/folder deletion is attempted after commit +3. File deletion failures are logged but don't fail the transaction + +Orphaned files (from failed deletes or crashed inserts) can be cleaned up using maintenance utilities. + +## Best Practices + +1. **Use staged insert for large objects**: Avoid copying multi-gigabyte files through local storage +2. **Set primary keys before calling `store()`**: The storage path depends on primary key values +3. **Use meaningful extensions**: Extensions like `.zarr`, `.hdf5` help identify content type +4. **Verify after critical inserts**: Call `obj.verify()` for important data +5. **Configure partitioning for large datasets**: Improves storage organization and browsing diff --git a/docs/src/design/autopopulate-2.0-spec.md b/docs/src/design/autopopulate-2.0-spec.md new file mode 100644 index 000000000..2e471cc5e --- /dev/null +++ b/docs/src/design/autopopulate-2.0-spec.md @@ -0,0 +1,726 @@ +# Autopopulate 2.0 Specification + +## Overview + +This specification redesigns the DataJoint job handling system to provide better visibility, control, and scalability for distributed computing workflows. The new system replaces the schema-level `~jobs` table with per-table job tables that offer richer status tracking, proper referential integrity, and dashboard-friendly monitoring. + +## Problem Statement + +### Current Jobs Table Limitations + +The existing `~jobs` table has significant limitations: + +1. **Limited status tracking**: Only supports `reserved`, `error`, and `ignore` statuses +2. **Functions as an error log**: Cannot efficiently track pending or completed jobs +3. **Poor dashboard visibility**: No way to monitor pipeline progress without querying multiple tables +4. **Key hashing obscures data**: Primary keys are stored as hashes, making debugging difficult +5. **No referential integrity**: Jobs table is independent of computed tables; orphaned jobs can accumulate + +### Key Source Limitations + +1. **Frequent manual modifications**: Subset operations require modifying `key_source` property +2. **Local visibility only**: Custom key sources are not accessible database-wide +3. **Performance bottleneck**: Multiple workers querying `key_source` simultaneously creates contention +4. **Codebase dependency**: Requires full pipeline codebase to determine pending work + +## Proposed Solution + +### Terminology + +- **Stale job**: A pending job whose upstream records have been deleted. The job references keys that no longer exist in `key_source`. Stale jobs are automatically cleaned up by `refresh()`. +- **Orphaned job**: A reserved job from a crashed or terminated process. The worker that reserved the job is no longer running, but the job remains in `reserved` status. Orphaned jobs must be cleared manually (see below). + +### Core Design Principles + +1. **Per-table jobs**: Each computed table gets its own hidden jobs table +2. **FK-derived primary keys**: Jobs table primary key includes only attributes derived from foreign keys in the target table's primary key (not additional primary key attributes) +3. **No FK constraints on jobs**: Jobs tables omit foreign key constraints for performance; stale jobs are cleaned by `refresh()` +4. **Rich status tracking**: Extended status values for full lifecycle visibility +5. **Automatic refresh**: `populate()` automatically refreshes the jobs queue (adding new jobs, removing stale ones) + +## Architecture + +### Jobs Table Structure + +Each `dj.Imported` or `dj.Computed` table `MyTable` will have an associated hidden jobs table `~my_table__jobs` with the following structure: + +``` +# Job queue for MyTable +subject_id : int +session_id : int +... # Only FK-derived primary key attributes (NO foreign key constraints) +--- +status : enum('pending', 'reserved', 'success', 'error', 'ignore') +priority : int # Lower = more urgent (0 = highest priority, default: 5) +created_time : datetime # When job was added to queue +scheduled_time : datetime # Process on or after this time (default: now) +reserved_time : datetime # When job was reserved (null if not reserved) +completed_time : datetime # When job completed (null if not completed) +duration : float # Execution duration in seconds (null if not completed) +error_message : varchar(2047) # Truncated error message +error_stack : mediumblob # Full error traceback +user : varchar(255) # Database user who reserved/completed job +host : varchar(255) # Hostname of worker +pid : int unsigned # Process ID of worker +connection_id : bigint unsigned # MySQL connection ID +version : varchar(255) # Code version (git hash, package version, etc.) +``` + +**Important**: The jobs table primary key includes only those attributes that come through foreign keys in the target table's primary key. Additional primary key attributes (if any) are excluded. This means: +- If a target table has primary key `(-> Subject, -> Session, method)`, the jobs table has primary key `(subject_id, session_id)` only +- Multiple target rows may map to a single job entry when additional PK attributes exist +- Jobs tables have **no foreign key constraints** for performance (stale jobs handled by `refresh()`) + +### Access Pattern + +Jobs are accessed as a property of the computed table: + +```python +# Current pattern (schema-level) +schema.jobs + +# New pattern (per-table) +MyTable.jobs + +# Examples +FilteredImage.jobs # Access jobs table +FilteredImage.jobs & 'status="error"' # Query errors +FilteredImage.jobs.refresh() # Refresh job queue +``` + +### Status Values + +| Status | Description | +|--------|-------------| +| `pending` | Job is queued and ready to be processed | +| `reserved` | Job is currently being processed by a worker | +| `success` | Job completed successfully (optional, depends on settings) | +| `error` | Job failed with an error | +| `ignore` | Job should be skipped (manually set, not part of automatic transitions) | + +### Status Transitions + +```mermaid +stateDiagram-v2 + state "(none)" as none1 + state "(none)" as none2 + none1 --> pending : refresh() + none1 --> ignore : ignore() + pending --> reserved : reserve() + reserved --> none2 : complete() + reserved --> success : complete()* + reserved --> error : error() + success --> pending : refresh()* + error --> none2 : delete() + success --> none2 : delete() + ignore --> none2 : delete() +``` + +- `complete()` deletes the job entry (default when `jobs.keep_completed=False`) +- `complete()*` keeps the job as `success` (when `jobs.keep_completed=True`) +- `refresh()*` re-pends a `success` job if its key is in `key_source` but not in target + +**Transition methods:** +- `refresh()` — Adds new jobs as `pending`; also re-pends `success` jobs if key is in `key_source` but not in target +- `ignore()` — Marks a key as `ignore` (can be called on keys not yet in jobs table) +- `reserve()` — Marks a pending job as `reserved` before calling `make()` +- `complete()` — Marks reserved job as `success`, or deletes it (based on `jobs.keep_completed` setting) +- `error()` — Marks reserved job as `error` with message and stack trace +- `delete()` — Inherited from `delete_quick()`; use `(jobs & condition).delete()` pattern + +**Manual status control:** +- `ignore` is set manually via `jobs.ignore(key)` and is not part of automatic transitions +- Jobs with `status='ignore'` are skipped by `populate()` and `refresh()` +- To reset an ignored job, delete it and call `refresh()`: `jobs.ignored.delete(); jobs.refresh()` + +## API Design + +### JobsTable Class + +```python +class JobsTable(Table): + """Hidden table managing job queue for a computed table.""" + + @property + def definition(self) -> str: + """Dynamically generated based on parent table's primary key.""" + ... + + def refresh( + self, + *restrictions, + delay: float = 0, + priority: int = 5, + stale_timeout: float = None + ) -> dict: + """ + Refresh the jobs queue: add new jobs and remove stale ones. + + Operations performed: + 1. Add new jobs: (key_source & restrictions) - target - jobs → insert as 'pending' + 2. Remove stale jobs: pending jobs older than stale_timeout whose keys + are no longer in key_source (upstream records were deleted) + + Args: + restrictions: Conditions to filter key_source + delay: Seconds from now until jobs become available for processing. + Default: 0 (jobs are immediately available). + Uses database server time to avoid client clock synchronization issues. + priority: Priority for new jobs (lower = more urgent). Default: 5 + stale_timeout: Seconds after which pending jobs are checked for staleness. + Jobs older than this are removed if their key is no longer + in key_source. Default from config: jobs.stale_timeout (3600s) + + Returns: + {'added': int, 'removed': int} - counts of jobs added and stale jobs removed + """ + ... + + def reserve(self, key: dict) -> bool: + """ + Attempt to reserve a job for processing. + + Updates status to 'reserved' if currently 'pending' and scheduled_time <= now. + No locking is used; rare conflicts are resolved by the make() transaction. + + Returns: + True if reservation successful, False if job not found or not pending. + """ + ... + + def complete(self, key: dict, duration: float = None) -> None: + """ + Mark a job as successfully completed. + + Updates status to 'success', records duration and completion time. + """ + ... + + def error(self, key: dict, error_message: str, error_stack: str = None) -> None: + """ + Mark a job as failed with error details. + + Updates status to 'error', records error message and stack trace. + """ + ... + + def ignore(self, key: dict) -> None: + """ + Mark a job to be ignored (skipped during populate). + + To reset an ignored job, delete it and call refresh(). + """ + ... + + # delete() is inherited from delete_quick() - no confirmation required + # Usage: (jobs & condition).delete() or jobs.errors.delete() + + @property + def pending(self) -> QueryExpression: + """Return query for pending jobs.""" + return self & 'status="pending"' + + @property + def reserved(self) -> QueryExpression: + """Return query for reserved jobs.""" + return self & 'status="reserved"' + + @property + def errors(self) -> QueryExpression: + """Return query for error jobs.""" + return self & 'status="error"' + + @property + def ignored(self) -> QueryExpression: + """Return query for ignored jobs.""" + return self & 'status="ignore"' + + @property + def completed(self) -> QueryExpression: + """Return query for completed jobs.""" + return self & 'status="success"' +``` + +### AutoPopulate Integration + +The `populate()` method is updated to use the new jobs table: + +```python +def populate( + self, + *restrictions, + suppress_errors: bool = False, + return_exception_objects: bool = False, + reserve_jobs: bool = False, + order: str = "original", + limit: int = None, + max_calls: int = None, + display_progress: bool = False, + processes: int = 1, + make_kwargs: dict = None, + # New parameters + priority: int = None, # Only process jobs at this priority or more urgent (lower values) + refresh: bool = True, # Refresh jobs queue if no pending jobs available +) -> dict: + """ + Populate the table by calling make() for each missing entry. + + New behavior with reserve_jobs=True: + 1. Fetch all non-stale pending jobs (ordered by priority ASC, scheduled_time ASC) + 2. For each pending job: + a. Mark job as 'reserved' (per-key, before make) + b. Call make(key) + c. On success: mark job as 'success' or delete (based on keep_completed) + d. On error: mark job as 'error' with message/stack + 3. If refresh=True and no pending jobs were found, call self.jobs.refresh() + and repeat from step 1 + 4. Continue until no more pending jobs or max_calls reached + """ + ... +``` + +### Progress and Monitoring + +```python +# Current progress reporting +remaining, total = MyTable.progress() + +# Enhanced progress with jobs table +MyTable.jobs.progress() # Returns detailed status breakdown + +# Example output: +# { +# 'pending': 150, +# 'reserved': 3, +# 'success': 847, +# 'error': 12, +# 'ignore': 5, +# 'total': 1017 +# } +``` + +### Priority and Scheduling + +Priority and scheduling are handled via `refresh()` parameters. Lower priority values are more urgent (0 = highest priority). Scheduling uses relative time (seconds from now) based on database server time. + +```python +# Add urgent jobs (priority=0 is most urgent) +MyTable.jobs.refresh(priority=0) + +# Add normal jobs (default priority=5) +MyTable.jobs.refresh() + +# Add low-priority background jobs +MyTable.jobs.refresh(priority=10) + +# Schedule jobs for future processing (2 hours from now) +MyTable.jobs.refresh(delay=2*60*60) # 7200 seconds + +# Schedule jobs for tomorrow (24 hours from now) +MyTable.jobs.refresh(delay=24*60*60) + +# Combine: urgent jobs with 1-hour delay +MyTable.jobs.refresh(priority=0, delay=3600) + +# Add urgent jobs for specific subjects +MyTable.jobs.refresh(Subject & 'priority="urgent"', priority=0) +``` + +## Implementation Details + +### Table Naming Convention + +Jobs tables follow the existing hidden table naming pattern: +- Table `FilteredImage` (stored as `__filtered_image`) +- Jobs table: `~filtered_image__jobs` (stored as `_filtered_image__jobs`) + +### Primary Key Derivation + +The jobs table primary key includes only those attributes derived from foreign keys in the target table's primary key: + +```python +# Example 1: FK-only primary key (simple case) +@schema +class FilteredImage(dj.Computed): + definition = """ + -> Image + --- + filtered_image : + """ +# Jobs table primary key: (image_id) — same as target + +# Example 2: Target with additional PK attribute +@schema +class Analysis(dj.Computed): + definition = """ + -> Recording + analysis_method : varchar(32) # Additional PK attribute + --- + result : float + """ +# Jobs table primary key: (recording_id) — excludes 'analysis_method' +# One job entry covers all analysis_method values for a given recording +``` + +The jobs table has **no foreign key constraints** for performance reasons. + +### Stale Job Handling + +Stale jobs are pending jobs whose upstream records have been deleted. Since there are no FK constraints on jobs tables, these jobs remain until cleaned up by `refresh()`: + +```python +# refresh() handles stale jobs automatically +result = FilteredImage.jobs.refresh() +# Returns: {'added': 10, 'removed': 3} # 3 stale jobs cleaned up + +# Stale detection logic: +# 1. Find pending jobs where created_time < (now - stale_timeout) +# 2. Check if their keys still exist in key_source +# 3. Remove pending jobs whose keys no longer exist +``` + +**Why not use foreign key cascading deletes?** +- FK constraints add overhead on every insert/update/delete operation +- Jobs tables are high-traffic (frequent reservations and status updates) +- Stale jobs are harmless until refresh—they simply won't match key_source +- The `refresh()` approach is more efficient for batch cleanup + +### Table Drop and Alter Behavior + +When an auto-populated table is **dropped**, its associated jobs table is automatically dropped: + +```python +# Dropping FilteredImage also drops ~filtered_image__jobs +FilteredImage.drop() +``` + +When an auto-populated table is **altered** (e.g., primary key changes), the jobs table is dropped and can be recreated via `refresh()`: + +```python +# Alter that changes primary key structure +# Jobs table is dropped since its structure no longer matches +FilteredImage.alter() + +# Recreate jobs table with new structure +FilteredImage.jobs.refresh() +``` + +### Lazy Table Creation + +Jobs tables are created automatically on first use: + +```python +# First call to populate with reserve_jobs=True creates the jobs table +FilteredImage.populate(reserve_jobs=True) +# Creates ~filtered_image__jobs if it doesn't exist, then populates + +# Alternatively, explicitly create/refresh the jobs table +FilteredImage.jobs.refresh() +``` + +The jobs table is created with a primary key derived from the target table's foreign key attributes. + +### Conflict Resolution + +Conflict resolution relies on the transaction surrounding each `make()` call. This applies regardless of whether `reserve_jobs=True` or `reserve_jobs=False`: + +- With `reserve_jobs=False`: Workers query `key_source` directly and may attempt the same key +- With `reserve_jobs=True`: Job reservation reduces conflicts but doesn't eliminate them entirely + +When two workers attempt to populate the same key: +1. Both call `make()` for the same key +2. First worker's `make()` transaction commits, inserting the result +3. Second worker's `make()` transaction fails with duplicate key error +4. Second worker catches the error, and the job returns to `pending` or `(none)` state + +**Important**: Only errors that occur *inside* `make()` are logged with `error` status. Duplicate key errors from collisions occur outside the `make()` logic and are handled silently—the job is either retried or reverts to `pending`/`(none)`. This distinction ensures the error log contains only genuine computation failures, not coordination artifacts. + +**Why this is acceptable**: +- The `make()` transaction guarantees data integrity +- Duplicate key error is a clean, expected signal (not a real error) +- With `reserve_jobs=True`, conflicts are rare (requires near-simultaneous reservation) +- Wasted computation is minimal compared to locking complexity + +### Job Reservation vs Pre-Partitioning + +The job reservation mechanism (`reserve_jobs=True`) allows workers to dynamically claim jobs from a shared queue. However, some orchestration systems may prefer to **pre-partition** jobs before distributing them to workers: + +```python +# Pre-partitioning example: orchestrator divides work explicitly +all_pending = FilteredImage.jobs.pending.fetch("KEY") + +# Split jobs among workers (e.g., by worker index) +n_workers = 4 +for worker_id in range(n_workers): + worker_jobs = all_pending[worker_id::n_workers] # Round-robin assignment + # Send worker_jobs to worker via orchestration system (Slurm, K8s, etc.) + +# Worker receives its assigned keys and processes them directly +for key in assigned_keys: + FilteredImage.populate(key, reserve_jobs=False) +``` + +**When to use each approach**: + +| Approach | Use Case | +|----------|----------| +| **Dynamic reservation** (`reserve_jobs=True`) | Simple setups, variable job durations, workers that start/stop dynamically | +| **Pre-partitioning** | Batch schedulers (Slurm, PBS), predictable job counts, avoiding reservation overhead | + +Both approaches benefit from the same transaction-based conflict resolution as a safety net. + +### Orphaned Job Handling + +Orphaned jobs are reserved jobs from crashed or terminated processes. The API does not provide an algorithmic method for detecting or clearing orphaned jobs because this is dependent on the orchestration system (e.g., Slurm job IDs, Kubernetes pod status, process heartbeats). + +Users must manually clear orphaned jobs using the `delete()` method: + +```python +# Delete all reserved jobs (use with caution - may kill active jobs!) +MyTable.jobs.reserved.delete() + +# Delete reserved jobs from a specific host that crashed +(MyTable.jobs.reserved & 'host="crashed-node"').delete() + +# Delete reserved jobs older than 1 hour (likely orphaned) +(MyTable.jobs.reserved & 'reserved_time < NOW() - INTERVAL 1 HOUR').delete() + +# Delete and re-add as pending +MyTable.jobs.reserved.delete() +MyTable.jobs.refresh() +``` + +**Note**: Deleting a reserved job does not terminate the running worker—it simply removes the reservation record. If the worker is still running, it will complete its `make()` call. If the job is then refreshed as pending and picked up by another worker, duplicated work may occur. Coordinate with your orchestration system to identify truly orphaned jobs before clearing them. + +## Configuration Options + +New configuration settings for job management: + +```python +# In datajoint config +dj.config['jobs.auto_refresh'] = True # Auto-refresh on populate (default: True) +dj.config['jobs.keep_completed'] = False # Keep success records (default: False) +dj.config['jobs.stale_timeout'] = 3600 # Seconds before pending job is considered stale (default: 3600) +dj.config['jobs.default_priority'] = 5 # Default priority for new jobs (lower = more urgent) +``` + +## Usage Examples + +### Basic Distributed Computing + +```python +# Worker 1 +FilteredImage.populate(reserve_jobs=True) + +# Worker 2 (can run simultaneously) +FilteredImage.populate(reserve_jobs=True) + +# Monitor progress +print(FilteredImage.jobs.progress()) +``` + +### Priority-Based Processing + +```python +# Add urgent jobs (priority=0 is most urgent) +urgent_subjects = Subject & 'priority="urgent"' +FilteredImage.jobs.refresh(urgent_subjects, priority=0) + +# Workers will process lowest-priority-value jobs first +FilteredImage.populate(reserve_jobs=True) +``` + +### Scheduled Processing + +```python +# Schedule jobs for overnight processing (8 hours from now) +FilteredImage.jobs.refresh('subject_id > 100', delay=8*60*60) + +# Only jobs whose scheduled_time <= now will be processed +FilteredImage.populate(reserve_jobs=True) +``` + +### Error Recovery + +```python +# View errors +errors = FilteredImage.jobs.errors.fetch(as_dict=True) +for err in errors: + print(f"Key: {err['subject_id']}, Error: {err['error_message']}") + +# Delete specific error jobs after fixing the issue +(FilteredImage.jobs & 'subject_id=42').delete() + +# Delete all error jobs +FilteredImage.jobs.errors.delete() + +# Re-add deleted jobs as pending (if keys still in key_source) +FilteredImage.jobs.refresh() +``` + +### Dashboard Queries + +```python +# Get pipeline-wide status using schema.jobs +def pipeline_status(schema): + return { + jt.target.table_name: jt.progress() + for jt in schema.jobs + } + +# Example output: +# { +# 'FilteredImage': {'pending': 150, 'reserved': 3, 'success': 847, 'error': 12}, +# 'Analysis': {'pending': 500, 'reserved': 0, 'success': 0, 'error': 0}, +# } + +# Refresh all jobs tables in the schema +for jobs_table in schema.jobs: + jobs_table.refresh() + +# Get all errors across the pipeline +all_errors = [] +for jt in schema.jobs: + errors = jt.errors.fetch(as_dict=True) + for err in errors: + err['_table'] = jt.target.table_name + all_errors.append(err) +``` + +## Backward Compatibility + +### Migration + +This is a major release. The legacy schema-level `~jobs` table is replaced by per-table jobs tables: + +- **Legacy `~jobs` table**: No longer used; can be dropped manually if present +- **New jobs tables**: Created automatically on first `populate(reserve_jobs=True)` call +- **No parallel support**: Teams should migrate cleanly to the new system + +### API Compatibility + +The `schema.jobs` property returns a list of all jobs table objects for auto-populated tables in the schema: + +```python +# Returns list of JobsTable objects +schema.jobs +# [FilteredImage.jobs, Analysis.jobs, ...] + +# Iterate over all jobs tables +for jobs_table in schema.jobs: + print(f"{jobs_table.target.table_name}: {jobs_table.progress()}") + +# Query all errors across the schema +all_errors = [job for jt in schema.jobs for job in jt.errors.fetch(as_dict=True)] + +# Refresh all jobs tables +for jobs_table in schema.jobs: + jobs_table.refresh() +``` + +This replaces the legacy single `~jobs` table with direct access to per-table jobs. + +## Hazard Analysis + +This section identifies potential hazards and their mitigations. + +### Race Conditions + +| Hazard | Description | Mitigation | +|--------|-------------|------------| +| **Simultaneous reservation** | Two workers reserve the same pending job at nearly the same time | Acceptable: duplicate `make()` calls are resolved by transaction—second worker gets duplicate key error | +| **Reserve during refresh** | Worker reserves a job while another process is running `refresh()` | No conflict: `refresh()` adds new jobs and removes stale ones; reservation updates existing rows | +| **Concurrent refresh calls** | Multiple processes call `refresh()` simultaneously | Acceptable: may result in duplicate insert attempts, but primary key constraint prevents duplicates | +| **Complete vs delete race** | One process completes a job while another deletes it | Acceptable: one operation succeeds, other becomes no-op (row not found) | + +### State Transitions + +| Hazard | Description | Mitigation | +|--------|-------------|------------| +| **Invalid state transition** | Code attempts illegal transition (e.g., pending → success) | Implementation enforces valid transitions; invalid attempts raise error | +| **Stuck in reserved** | Worker crashes while job is reserved (orphaned job) | Manual intervention required: `jobs.reserved.delete()` (see Orphaned Job Handling) | +| **Success re-pended unexpectedly** | `refresh()` re-pends a success job when user expected it to stay | Only occurs if `keep_completed=True` AND key exists in `key_source` but not in target; document clearly | +| **Ignore not respected** | Ignored jobs get processed anyway | Implementation must skip `status='ignore'` in `populate()` job fetching | + +### Data Integrity + +| Hazard | Description | Mitigation | +|--------|-------------|------------| +| **Stale job processed** | Job references deleted upstream data | `make()` will fail or produce invalid results; `refresh()` cleans stale jobs before processing | +| **Jobs table out of sync** | Jobs table doesn't match `key_source` | `refresh()` synchronizes; call periodically or rely on `populate(refresh=True)` | +| **Partial make failure** | `make()` partially succeeds then fails | DataJoint transaction rollback ensures atomicity; job marked as error | +| **Error message truncation** | Error details exceed `varchar(2047)` | Full stack stored in `error_stack` (mediumblob); `error_message` is summary only | + +### Performance + +| Hazard | Description | Mitigation | +|--------|-------------|------------| +| **Large jobs table** | Jobs table grows very large with `keep_completed=True` | Default is `keep_completed=False`; provide guidance on periodic cleanup | +| **Slow refresh on large key_source** | `refresh()` queries entire `key_source` | Can restrict refresh to subsets: `jobs.refresh(Subject & 'lab="smith"')` | +| **Many jobs tables per schema** | Schema with many computed tables has many jobs tables | Jobs tables are lightweight; only created on first use | + +### Operational + +| Hazard | Description | Mitigation | +|--------|-------------|------------| +| **Accidental job deletion** | User runs `jobs.delete()` without restriction | `delete()` inherits from `delete_quick()` (no confirmation); users must apply restrictions carefully | +| **Clearing active jobs** | User clears reserved jobs while workers are still running | May cause duplicated work if job is refreshed and picked up again; coordinate with orchestrator | +| **Priority confusion** | User expects higher number = higher priority | Document clearly: lower values are more urgent (0 = highest priority) | + +### Migration + +| Hazard | Description | Mitigation | +|--------|-------------|------------| +| **Legacy ~jobs table conflict** | Old `~jobs` table exists alongside new per-table jobs | Systems are independent; legacy table can be dropped manually | +| **Mixed version workers** | Some workers use old system, some use new | Major release; do not support mixed operation—require full migration | +| **Lost error history** | Migrating loses error records from legacy table | Document migration procedure; users can export legacy errors before migration | + +## Future Extensions + +- [ ] Web-based dashboard for job monitoring +- [ ] Webhook notifications for job completion/failure +- [ ] Job dependencies (job B waits for job A) +- [ ] Resource tagging (GPU required, high memory, etc.) +- [ ] Retry policies (max retries, exponential backoff) +- [ ] Job grouping/batching for efficiency +- [ ] Integration with external schedulers (Slurm, PBS, etc.) + +## Rationale + +### Why Not External Orchestration? + +The team considered integrating external tools like Airflow or Flyte but rejected this approach because: + +1. **Deployment complexity**: External orchestrators require significant infrastructure +2. **Maintenance burden**: Additional systems to maintain and monitor +3. **Accessibility**: Not all DataJoint users have access to orchestration platforms +4. **Tight integration**: DataJoint's transaction model requires close coordination + +The built-in jobs system provides 80% of the value with minimal additional complexity. + +### Why Per-Table Jobs? + +Per-table jobs tables provide: + +1. **Better isolation**: Jobs for one table don't affect others +2. **Simpler queries**: No need to filter by table_name +3. **Native keys**: Primary keys are readable, not hashed +4. **High performance**: No FK constraints means minimal overhead on job operations +5. **Scalability**: Each table's jobs can be indexed independently + +### Why Remove Key Hashing? + +The current system hashes primary keys to support arbitrary key types. The new system uses native keys because: + +1. **Readability**: Debugging is much easier with readable keys +2. **Query efficiency**: Native keys can use table indexes +3. **Foreign keys**: Hash-based keys cannot participate in foreign key relationships +4. **Simplicity**: No need for hash computation and comparison + +### Why FK-Derived Primary Keys Only? + +The jobs table primary key includes only attributes derived from foreign keys in the target table's primary key. This design: + +1. **Aligns with key_source**: The `key_source` query naturally produces keys matching the FK-derived attributes +2. **Simplifies job identity**: A job's identity is determined by its upstream dependencies +3. **Handles additional PK attributes**: When targets have additional PK attributes (e.g., `method`), one job covers all values for that attribute diff --git a/docs/src/design/integrity.md b/docs/src/design/integrity.md deleted file mode 100644 index cb7122755..000000000 --- a/docs/src/design/integrity.md +++ /dev/null @@ -1,218 +0,0 @@ -# Data Integrity - -The term **data integrity** describes guarantees made by the data management process -that prevent errors and corruption in data due to technical failures and human errors -arising in the course of continuous use by multiple agents. -DataJoint pipelines respect the following forms of data integrity: **entity -integrity**, **referential integrity**, and **group integrity** as described in more -detail below. - -## Entity integrity - -In a proper relational design, each table represents a collection of discrete -real-world entities of some kind. -**Entity integrity** is the guarantee made by the data management process that entities -from the real world are reliably and uniquely represented in the database system. -Entity integrity states that the data management process must prevent duplicate -representations or misidentification of entities. -DataJoint enforces entity integrity through the use of -[primary keys](./tables/primary.md). - -Entity integrity breaks down when a process allows data pertaining to the same -real-world entity to be entered into the database system multiple times. -For example, a school database system may use unique ID numbers to distinguish students. -Suppose the system automatically generates an ID number each time a student record is -entered into the database without checking whether a record already exists for that -student. -Such a system violates entity integrity, because the same student may be assigned -multiple ID numbers. -The ID numbers succeed in uniquely identifying each student record but fail to do so -for the actual students. - -Note that a database cannot guarantee or enforce entity integrity by itself. -Entity integrity is a property of the entire data management process as a whole, -including institutional practices and user actions in addition to database -configurations. - -## Referential integrity - -**Referential integrity** is the guarantee made by the data management process that -related data across the database remain present, correctly associated, and mutually -consistent. -Guaranteeing referential integrity means enforcing the constraint that no entity can -exist in the database without all the other entities on which it depends. -Referential integrity cannot exist without entity integrity: references to entity -cannot be validated if the identity of the entity itself is not guaranteed. - -Referential integrity fails when a data management process allows new data to be -entered that refers to other data missing from the database. -For example, assume that each electrophysiology recording must refer to the mouse -subject used during data collection. -Perhaps an experimenter attempts to insert ephys data into the database that refers to -a nonexistent mouse, due to a misspelling. -A system guaranteeing referential integrity, such as DataJoint, will refuse the -erroneous data. - -Enforcement of referential integrity does not stop with data ingest. -[Deleting](../manipulation/delete.md) data in DataJoint also deletes any dependent -downstream data. -Such cascading deletions are necessary to maintain referential integrity. -Consider the deletion of a mouse subject without the deletion of the experimental -sessions involving that mouse. -A database that allows such deletion will break referential integrity, as the -experimental sessions for the removed mouse depend on missing data. -Any data management process that allows data to be deleted with no consideration of -dependent data cannot maintain referential integrity. - -[Updating](../manipulation/update.md) data already present in a database system also -jeopardizes referential integrity. -For this reason, the DataJoint workflow does not include updates to entities once they -have been ingested into a pipeline. -Allowing updates to upstream entities would break the referential integrity of any -dependent data downstream. -For example, permitting a user to change the name of a mouse subject would invalidate -any experimental sessions that used that mouse, presuming the mouse name was part of -the primary key. -The proper way to change data in DataJoint is to delete the existing entities and to -insert corrected ones, preserving referential integrity. - -## Group integrity - -**Group integrity** denotes the guarantee made by the data management process that -entities composed of multiple parts always appear in their complete form. -Group integrity in DataJoint is formalized through -[master-part](./tables/master-part.md) relationships. -The master-part relationship has important implications for dependencies, because a -downstream entity depending on a master entity set may be considered to depend on the -parts as well. - -## Relationships - -In DataJoint, the term **relationship** is used rather generally to describe the -effects of particular configurations of [dependencies](./tables/dependencies.md) -between multiple entity sets. -It is often useful to classify relationships as one-to-one, many-to-one, one-to-many, -and many-to-many. - -In a **one-to-one relationship**, each entity in a downstream table has exactly one -corresponding entity in the upstream table. -A dependency of an entity set containing the death dates of mice on an entity set -describing the mice themselves would obviously be a one-to-one relationship, as in the -example below. - -```python -@schema -class Mouse(dj.Manual): -definition = """ -mouse_name : varchar(64) ---- -mouse_dob : datetime -""" - -@schema -class MouseDeath(dj.Manual): -definition = """ --> Mouse ---- -death_date : datetime -""" -``` - -![doc_1-1](../images/doc_1-1.png){: style="align:center"} - -In a **one-to-many relationship**, multiple entities in a downstream table may depend -on the same entity in the upstream table. -The example below shows a table containing individual channel data from multi-channel -recordings, representing a one-to-many relationship. - -```python -@schema -class EEGRecording(dj.Manual): -definition = """ --> Session -eeg_recording_id : int ---- -eeg_system : varchar(64) -num_channels : int -""" - -@schema -class ChannelData(dj.Imported): -definition = """ --> EEGRecording -channel_idx : int ---- -channel_data : longblob -""" -``` -![doc_1-many](../images/doc_1-many.png){: style="align:center"} - -In a **many-to-one relationship**, each entity in a table is associated with multiple -entities from another table. -Many-to-one relationships between two tables are usually established using a separate -membership table. -The example below includes a table of mouse subjects, a table of subject groups, and a -membership [part table](./tables/master-part.md) listing the subjects in each group. -A many-to-one relationship exists between the `Mouse` table and the `SubjectGroup` -table, with is expressed through entities in `GroupMember`. - -```python -@schema -class Mouse(dj.Manual): -definition = """ -mouse_name : varchar(64) ---- -mouse_dob : datetime -""" - -@schema -class SubjectGroup(dj.Manual): -definition = """ -group_number : int ---- -group_name : varchar(64) -""" - -class GroupMember(dj.Part): - definition = """ - -> master - -> Mouse - """ -``` - -![doc_many-1](../images/doc_many-1.png){: style="align:center"} - -In a **many-to-many relationship**, multiple entities in one table may each relate to -multiple entities in another upstream table. -Many-to-many relationships between two tables are usually established using a separate -association table. -Each entity in the association table links one entity from each of the two upstream -tables it depends on. -The below example of a many-to-many relationship contains a table of recording -modalities and a table of multimodal recording sessions. -Entities in a third table represent the modes used for each session. - -```python -@schema -class RecordingModality(dj.Lookup): -definition = """ -modality : varchar(64) -""" - -@schema -class MultimodalSession(dj.Manual): -definition = """ --> Session -modes : int -""" -class SessionMode(dj.Part): - definition = """ - -> master - -> RecordingModality - """ -``` - -![doc_many-many](../images/doc_many-many.png){: style="align:center"} - -The types of relationships between entity sets are expressed in the -[Diagram](diagrams.md) of a schema. diff --git a/docs/src/design/normalization.md b/docs/src/design/normalization.md deleted file mode 100644 index 000028396..000000000 --- a/docs/src/design/normalization.md +++ /dev/null @@ -1,117 +0,0 @@ -# Entity Normalization - -DataJoint uses a uniform way of representing any data. -It does so in the form of **entity sets**, unordered collections of entities of the -same type. -The term **entity normalization** describes the commitment to represent all data as -well-formed entity sets. -Entity normalization is a conceptual refinement of the -[relational data model](../concepts/data-model.md) and is the central principle of the -DataJoint model ([Yatsenko et al., 2018](https://arxiv.org/abs/1807.11104)). -Entity normalization leads to clear and logical database designs and to easily -comprehensible data queries. - -Entity sets are a type of **relation** -(from the [relational data model](../concepts/data-model.md)) and are often visualized -as **tables**. -Hence the terms **relation**, **entity set**, and **table** can be used interchangeably -when entity normalization is assumed. - -## Criteria of a well-formed entity set - -1. All elements of an entity set belong to the same well-defined and readily identified -**entity type** from the model world. -2. All attributes of an entity set are applicable directly to each of its elements, -although some attribute values may be missing (set to null). -3. All elements of an entity set must be distinguishable form each other by the same -primary key. -4. Primary key attribute values cannot be missing, i.e. set to null. -5. All elements of an entity set participate in the same types of relationships with -other entity sets. - -## Entity normalization in schema design - -Entity normalization applies to schema design in that the designer is responsible for -the identification of the essential entity types in their model world and of the -dependencies among the entity types. - -The term entity normalization may also apply to a procedure for refactoring a schema -design that does not meet the above criteria into one that does. -In some cases, this may require breaking up some entity sets into multiple entity sets, -which may cause some entities to be represented across multiple entity sets. -In other cases, this may require converting attributes into their own entity sets. -Technically speaking, entity normalization entails compliance with the -[Boyce-Codd normal form](https://en.wikipedia.org/wiki/Boyce%E2%80%93Codd_normal_form) -while lacking the representational power for the applicability of more complex normal -forms ([Kent, 1983](https://dl.acm.org/citation.cfm?id=358054)). -Adherence to entity normalization prevents redundancies in storage and data -manipulation anomalies. -The same criteria originally motivated the formulation of the classical relational -normal forms. - -## Entity normalization in data queries - -Entity normalization applies to data queries as well. -DataJoint's [query operators](../query/operators.md) are designed to preserve the -entity normalization of their inputs. -For example, the outputs of operators [restriction](../query/restrict.md), -[proj](../query/project.md), and [aggr](../query/aggregation.md) retain the same entity -type as the (first) input. -The [join](../query/join.md) operator produces a new entity type comprising the pairing -of the entity types of its inputs. -[Universal sets](../query/universals.md) explicitly introduce virtual entity sets when -necessary to accomplish a query. - -## Examples of poor normalization - -Design choices lacking entity normalization may lead to data inconsistencies or -anomalies. -Below are several examples of poorly normalized designs and their normalized -alternatives. - -### Indirect attributes - -All attributes should apply to the entity itself. -Avoid attributes that actually apply to one of the entity's other attributes. -For example, consider the table `Author` with attributes `author_name`, `institution`, -and `institution_address`. -The attribute `institution_address` should really be held in a separate `Institution` -table that `Author` depends on. - -### Repeated attributes - -Avoid tables with repeated attributes of the same category. -A better solution is to create a separate table that depends on the first (often a -[part table](../design/tables/master-part.md)), with multiple individual entities -rather than repeated attributes. -For example, consider the table `Protocol` that includes the attributes `equipment1`, -`equipment2`, and `equipment3`. -A better design would be to create a `ProtocolEquipment` table that links each entity -in `Protocol` with multiple entities in `Equipment` through -[dependencies](../design/tables/dependencies.md). - -### Attributes that do not apply to all entities - -All attributes should be relevant to every entity in a table. -Attributes that apply only to a subset of entities in a table likely belong in a -separate table containing only that subset of entities. -For example, a table `Protocol` should include the attribute `stimulus` only if all -experiment protocols include stimulation. -If the not all entities in `Protocol` involve stimulation, then the `stimulus` -attribute should be moved to a part table that has `Protocol` as its master. -Only protocols using stimulation will have an entry in this part table. - -### Transient attributes - -Attributes should be relevant to all entities in a table at all times. -Attributes that do not apply to all entities should be moved to another dependent table -containing only the appropriate entities. -This principle also applies to attributes that have not yet become meaningful for some -entities or that will not remain meaningful indefinitely. -For example, consider the table `Mouse` with attributes `birth_date` and `death_date`, -where `death_date` is set to `NULL` for living mice. -Since the `death_date` attribute is not meaningful for mice that are still living, -the proper design would include a separate table `DeceasedMouse` that depends on -`Mouse`. -`DeceasedMouse` would only contain entities for dead mice, which improves integrity and -averts the need for [updates](../manipulation/update.md). diff --git a/docs/src/design/recall.md b/docs/src/design/recall.md deleted file mode 100644 index 56226cabd..000000000 --- a/docs/src/design/recall.md +++ /dev/null @@ -1,207 +0,0 @@ -# Work with Existing Pipelines - -## Loading Classes - -This section describes how to work with database schemas without access to the -original code that generated the schema. These situations often arise when the -database is created by another user who has not shared the generating code yet -or when the database schema is created from a programming language other than -Python. - -```python -import datajoint as dj -``` - -### Working with schemas and their modules - -Typically a DataJoint schema is created as a dedicated Python module. This -module defines a schema object that is used to link classes declared in the -module to tables in the database schema. As an example, examine the university -module: [university.py](https://github.com/datajoint-company/db-programming-with-datajoint/blob/master/notebooks/university.py). - -You may then import the module to interact with its tables: - -```python -import university as uni -dj.Diagram(uni) -``` - -![query object preview](../images/virtual-module-ERD.svg){: style="align:center"} - -Note that dj.Diagram can extract the diagram from a schema object or from a -Python module containing its schema object, lending further support to the -convention of one-to-one correspondence between database schemas and Python -modules in a DataJoint project: - -`dj.Diagram(uni)` - -is equivalent to - -`dj.Diagram(uni.schema)` - -```python -# students without majors -uni.Student - uni.StudentMajor -``` - -![query object preview](../images/StudentTable.png){: style="align:center"} - -### Spawning missing classes - -Now imagine that you do not have access to `university.py` or you do not have -its latest version. You can still connect to the database schema but you will -not have classes declared to interact with it. - -So let's start over in this scenario. - -You may use the `dj.list_schemas` function (new in DataJoint 0.12.0) to -list the names of database schemas available to you. - -```python -import datajoint as dj -dj.list_schemas() -``` - -```text -*['dimitri_alter','dimitri_attach','dimitri_blob','dimitri_blobs', -'dimitri_nphoton','dimitri_schema','dimitri_university','dimitri_uuid', -'university']* -``` - -Just as with a new schema, we start by creating a schema object to connect to -the chosen database schema: - -```python -schema = dj.Schema('dimitri_university') -``` - -If the schema already exists, `dj.Schema` is initialized as usual and you may plot -the schema diagram. But instead of seeing class names, you will see the raw -table names as they appear in the database. - -```python -# let's plot its diagram -dj.Diagram(schema) -``` - -![query object preview](../images/dimitri-ERD.svg){: style="align:center"} - -You may view the diagram but, at this point, there is no way to interact with -these tables. A similar situation arises when another developer has added new -tables to the schema but has not yet shared the updated module code with you. -Then the diagram will show a mixture of class names and database table names. - -Now you may use the `spawn_missing_classes` method to spawn classes into -the local namespace for any tables missing their classes: - -```python -schema.spawn_missing_classes() -dj.Diagram(schema) -``` - -![query object preview](../images/spawned-classes-ERD.svg){: style="align:center"} - -Now you may interact with these tables as if they were declared right here in -this namespace: - -```python -# students without majors -Student - StudentMajor -``` - -![query object preview](../images/StudentTable.png){: style="align:center"} - -### Creating a virtual module - -Virtual modules provide a way to access the classes corresponding to tables in a -DataJoint schema without having to create local files. - -`spawn_missing_classes` creates the new classes in the local namespace. -However, it is often more convenient to import a schema with its Python module, -equivalent to the Python command: - -```python -import university as uni -``` - -We can mimic this import without having access to `university.py` using the -`VirtualModule` class object: - -```python -import datajoint as dj - -uni = dj.VirtualModule(module_name='university.py', schema_name='dimitri_university') -``` - -Now `uni` behaves as an imported module complete with the schema object and all -the table classes. - -```python -dj.Diagram(uni) -``` - -![query object preview](../images/added-example-ERD.svg){: style="align:center"} - -```python -uni.Student - uni.StudentMajor -``` - -![query object preview](../images/StudentTable.png){: style="align:center"} - -`dj.VirtualModule` takes required arguments - -- `module_name`: displayed module name. - -- `schema_name`: name of the database in MySQL. - -And `dj.VirtualModule` takes optional arguments. - -First, `create_schema=False` assures that an error is raised when the schema -does not already exist. Set it to `True` if you want to create an empty schema. - -```python -dj.VirtualModule('what', 'nonexistent') -``` - -Returns - -```python ---------------------------------------------------------------------------- -DataJointError Traceback (most recent call last) -. -. -. -DataJointError: Database named `nonexistent` was not defined. Set argument create_schema=True to create it. -``` - -The other optional argument, `create_tables=False` is passed to the schema -object. It prevents the use of the schema object of the virtual module for -creating new tables in the existing schema. This is a precautionary measure -since virtual modules are often used for completed schemas. You may set this -argument to `True` if you wish to add new tables to the existing schema. A -more common approach in this scenario would be to create a new schema object and -to use the `spawn_missing_classes` function to make the classes available. - -However, you if do decide to create new tables in an existing tables using the -virtual module, you may do so by using the schema object from the module as the -decorator for declaring new tables: - -```python -uni = dj.VirtualModule('university.py', 'dimitri_university', create_tables=True) -``` - -```python -@uni.schema -class Example(dj.Manual): - definition = """ - -> uni.Student - --- - example : varchar(255) - """ -``` - -```python -dj.Diagram(uni) -``` - -![query object preview](../images/added-example-ERD.svg){: style="align:center"} diff --git a/docs/src/design/tables/attributes.md b/docs/src/design/tables/attributes.md index b68f552e5..1967f8397 100644 --- a/docs/src/design/tables/attributes.md +++ b/docs/src/design/tables/attributes.md @@ -48,11 +48,12 @@ fractional digits. Because of its well-defined precision, `decimal` values can be used in equality comparison and be included in primary keys. -- `longblob`: arbitrary numeric array (e.g. matrix, image, structure), up to 4 +- `longblob`: raw binary data, up to 4 [GiB](http://en.wikipedia.org/wiki/Gibibyte) in size. - Numeric arrays are compatible between MATLAB and Python (NumPy). + Stores and returns raw bytes without serialization. + For serialized Python objects (arrays, dicts, etc.), use `` instead. The `longblob` and other `blob` datatypes can be configured to store data - [externally](../../sysadmin/external-store.md) by using the `blob@store` syntax. + [externally](../../admin/external-store.md) by using the `blob@store` syntax. ## Less common (but supported) datatypes @@ -71,12 +72,23 @@ info). These types abstract certain kinds of non-database data to facilitate use together with DataJoint. +- ``: DataJoint's native serialization format for Python objects. Supports +NumPy arrays, dicts, lists, datetime objects, and nested structures. Compatible with +MATLAB. See [custom types](customtype.md) for details. + +- `object`: managed [file and folder storage](object.md) with support for direct writes +(Zarr, HDF5) and fsspec integration. Recommended for new pipelines. + - `attach`: a [file attachment](attach.md) similar to email attachments facillitating sending/receiving an opaque data file to/from a DataJoint pipeline. - `filepath@store`: a [filepath](filepath.md) used to link non-DataJoint managed files into a DataJoint pipeline. +- ``: a [custom attribute type](customtype.md) that defines bidirectional +conversion between Python objects and database storage formats. Use this to store +complex data types like graphs, domain-specific objects, or custom data structures. + ## Numeric type aliases DataJoint provides convenient type aliases that map to standard MySQL numeric types. diff --git a/docs/src/design/tables/blobs.md b/docs/src/design/tables/blobs.md deleted file mode 100644 index 9f73d54d4..000000000 --- a/docs/src/design/tables/blobs.md +++ /dev/null @@ -1,26 +0,0 @@ -# Blobs - -DataJoint provides functionality for serializing and deserializing complex data types -into binary blobs for efficient storage and compatibility with MATLAB's mYm -serialization. This includes support for: - -+ Basic Python data types (e.g., integers, floats, strings, dictionaries). -+ NumPy arrays and scalars. -+ Specialized data types like UUIDs, decimals, and datetime objects. - -## Serialization and Deserialization Process - -Serialization converts Python objects into a binary representation for efficient storage -within the database. Deserialization converts the binary representation back into the -original Python object. - -Blobs over 1 KiB are compressed using the zlib library to reduce storage requirements. - -## Supported Data Types - -DataJoint supports the following data types for serialization: - -+ Scalars: Integers, floats, booleans, strings. -+ Collections: Lists, tuples, sets, dictionaries. -+ NumPy: Arrays, structured arrays, and scalars. -+ Custom Types: UUIDs, decimals, datetime objects, MATLAB cell and struct arrays. diff --git a/docs/src/design/tables/customtype.md b/docs/src/design/tables/customtype.md deleted file mode 100644 index aad194ff5..000000000 --- a/docs/src/design/tables/customtype.md +++ /dev/null @@ -1,80 +0,0 @@ -# Custom Types - -In modern scientific research, data pipelines often involve complex workflows that -generate diverse data types. From high-dimensional imaging data to machine learning -models, these data types frequently exceed the basic representations supported by -traditional relational databases. For example: - -+ A lab working on neural connectivity might use graph objects to represent brain - networks. -+ Researchers processing raw imaging data might store custom objects for pre-processing - configurations. -+ Computational biologists might store fitted machine learning models or parameter - objects for downstream predictions. - -To handle these diverse needs, DataJoint provides the `dj.AttributeAdapter` method. It -enables researchers to store and retrieve complex, non-standard data types—like Python -objects or data structures—in a relational database while maintaining the -reproducibility, modularity, and query capabilities required for scientific workflows. - -## Uses in Scientific Research - -Imagine a neuroscience lab studying neural connectivity. Researchers might generate -graphs (e.g., networkx.Graph) to represent connections between brain regions, where: - -+ Nodes are brain regions. -+ Edges represent connections weighted by signal strength or another metric. - -Storing these graph objects in a database alongside other experimental data (e.g., -subject metadata, imaging parameters) ensures: - -1. Centralized Data Management: All experimental data and analysis results are stored - together for easy access and querying. -2. Reproducibility: The exact graph objects used in analysis can be retrieved later for - validation or further exploration. -3. Scalability: Graph data can be integrated into workflows for larger datasets or - across experiments. - -However, since graphs are not natively supported by relational databases, here’s where -`dj.AttributeAdapter` becomes essential. It allows researchers to define custom logic for -serializing graphs (e.g., as edge lists) and deserializing them back into Python -objects, bridging the gap between advanced data types and the database. - -### Example: Storing Graphs in DataJoint - -To store a networkx.Graph object in a DataJoint table, researchers can define a custom -attribute type in a datajoint table class: - -```python -import datajoint as dj - -class GraphAdapter(dj.AttributeAdapter): - - attribute_type = 'longblob' # this is how the attribute will be declared - - def put(self, obj): - # convert the nx.Graph object into an edge list - assert isinstance(obj, nx.Graph) - return list(obj.edges) - - def get(self, value): - # convert edge list back into an nx.Graph - return nx.Graph(value) - - -# instantiate for use as a datajoint type -graph = GraphAdapter() - - -# define a table with a graph attribute -schema = dj.schema('test_graphs') - - -@schema -class Connectivity(dj.Manual): - definition = """ - conn_id : int - --- - conn_graph = null : # a networkx.Graph object - """ -``` diff --git a/docs/src/design/tables/declare.md b/docs/src/design/tables/declare.md index d4fb070a2..2ebfb2e10 100644 --- a/docs/src/design/tables/declare.md +++ b/docs/src/design/tables/declare.md @@ -216,7 +216,7 @@ Such attributes must be uniquely named in each table, such as `session_start_tim Secondary attributes can be given default values. A default value will be used for an attribute if no other value is given at the time -the entity is [inserted](../../manipulation/insert.md) into the table. +the entity is [inserted](../../operations/insert.md) into the table. Generally, default values are numerical values or character strings. Default values for dates must be given as strings as well, contained within quotes (with the exception of `CURRENT_TIMESTAMP`). diff --git a/docs/src/design/tables/dependencies.md b/docs/src/design/tables/dependencies.md index e06278ee8..889deafe0 100644 --- a/docs/src/design/tables/dependencies.md +++ b/docs/src/design/tables/dependencies.md @@ -1,241 +1,378 @@ -# Dependencies +# Foreign Keys -## Understanding dependencies +Foreign keys define dependencies between tables. They link entities in one table +to entities in another, enabling both data relationships and workflow dependencies. -A schema contains collections of tables of related data. -Accordingly, entities in one table often derive some of their meaning or context from -entities in other tables. -A **foreign key** defines a **dependency** of entities in one table on entities in -another within a schema. -In more complex designs, dependencies can even exist between entities in tables from -different schemas. -Dependencies play a functional role in DataJoint and do not simply label the structure -of a pipeline. -Dependencies provide entities in one table with access to data in another table and -establish certain constraints on entities containing a foreign key. +## Basic Syntax -A DataJoint pipeline, including the dependency relationships established by foreign -keys, can be visualized as a graph with nodes and edges. -The diagram of such a graph is called the **entity relationship diagram** or -[Diagram](../diagrams.md). -The nodes of the graph are tables and the edges connecting them are foreign keys. -The edges are directed and the overall graph is a **directed acyclic graph**, a graph -with no loops. +Foreign keys use the arrow `->` notation in table definitions: -For example, the Diagram below is the pipeline for multipatching experiments +```python +@schema +class Session(dj.Manual): + definition = """ + -> Subject # references Subject table + session_date : date + --- + notes='' : varchar(2000) + """ +``` -![mp-diagram](../../images/mp-diagram.png){: style="align:center"} +This creates a dependency where each `Session` must reference an existing `Subject`. -The graph defines the direction of the workflow. -The tables at the top of the flow need to be populated first, followed by those tables -one step below and so forth until the last table is populated at the bottom of the -pipeline. -The top of the pipeline tends to be dominated by lookup tables (gray stars) and manual -tables (green squares). -The middle has many imported tables (blue triangles), and the bottom has computed -tables (red stars). +## Foreign Key Effects -## Defining a dependency +When table `B` references table `A` with `-> A`: -Foreign keys are defined with arrows `->` in the [table definition](declare.md), -pointing to another table. +1. **Attribute inheritance**: `A`'s primary key attributes become part of `B` +2. **Referential constraint**: Entities in `B` cannot exist without a matching entity in `A` +3. **Cascading delete**: Deleting from `A` automatically deletes dependent entities in `B` +4. **Automatic indexing**: Indexes are created to accelerate lookups -A foreign key may be defined as part of the [primary-key](primary.md). +## Primary vs Secondary Foreign Keys -In the Diagram, foreign keys from the primary key are shown as solid lines. -This means that the primary key of the referenced table becomes part of the primary key -of the new table. -A foreign key outside the primary key is indicated by dashed line in the ERD. +### Primary Key Foreign Keys (Solid Lines in Diagrams) -For example, the following definition for the table `mp.Slice` has three foreign keys, -including one within the primary key. +Foreign keys **above** the `---` line become part of the child's primary key: ```python -# brain slice --> mp.Subject -slice_id : smallint # slice number within subject ---- --> mp.BrainRegion --> mp.Plane -slice_date : date # date of the slicing (not patching) -thickness : smallint unsigned # slice thickness in microns -experimenter : varchar(20) # person who performed this experiment +@schema +class Trial(dj.Imported): + definition = """ + -> Session # part of primary key + trial_id : smallint # additional primary key attribute + --- + start_time : float # (seconds) + """ ``` -You can examine the resulting table heading with +The `Trial` table has primary key `(subject_id, session_date, trial_id)`. + +### Secondary Foreign Keys (Dashed Lines in Diagrams) + +Foreign keys **below** the `---` line are secondary attributes: ```python -mp.BrainSlice.heading +@schema +class Session(dj.Manual): + definition = """ + -> Subject + session_date : date + --- + -> [nullable] User # optional reference + notes='' : varchar(2000) + """ ``` -The heading of `mp.Slice` may look something like +## Complete Example Schema ```python -subject_id : char(8) # experiment subject id -slice_id : smallint # slice number within subject ---- -brain_region : varchar(12) # abbreviated name for brain region -plane : varchar(12) # plane of section -slice_date : date # date of the slicing (not patching) -thickness : smallint unsigned # slice thickness in microns -experimenter : varchar(20) # person who performed this experiment +import datajoint as dj +schema = dj.Schema('lab') + +@schema +class User(dj.Lookup): + definition = """ + username : varchar(20) + --- + full_name : varchar(100) + """ + contents = [ + ('alice', 'Alice Smith'), + ('bob', 'Bob Jones'), + ] + +@schema +class Subject(dj.Manual): + definition = """ + subject_id : int + --- + species : varchar(30) + date_of_birth : date + sex : enum('M', 'F', 'U') + """ + +@schema +class Session(dj.Manual): + definition = """ + -> Subject + session_date : date + --- + -> [nullable] User + session_notes='' : varchar(2000) + """ + +@schema +class Trial(dj.Imported): + definition = """ + -> Session + trial_id : smallint + --- + start_time : float # seconds + duration : float # seconds + """ ``` -This displayed heading reflects the actual attributes in the table. -The foreign keys have been replaced by the primary key attributes of the referenced -tables, including their data types and comments. +## Referential Integrity -## How dependencies work +Foreign keys enforce **referential integrity**—the guarantee that related data +remains consistent: -The foreign key `-> A` in the definition of table `B` has the following effects: +```python +# Insert a subject +Subject.insert1({'subject_id': 1, 'species': 'mouse', + 'date_of_birth': '2023-01-15', 'sex': 'M'}) -1. The primary key attributes of `A` are made part of `B`'s definition. -2. A referential constraint is created in `B` with reference to `A`. -3. If one does not already exist, an index is created to speed up searches in `B` for -matches to `A`. - (The reverse search is already fast because it uses the primary key of `A`.) +# Insert a session - requires existing subject +Session.insert1({'subject_id': 1, 'session_date': '2024-01-01', + 'username': 'alice'}) -A referential constraint means that an entity in `B` cannot exist without a matching -entity in `A`. -**Matching** means attributes in `B` that correspond to the primary key of `A` must -have the same values. -An attempt to insert an entity into `B` that does not have a matching counterpart in -`A` will fail. -Conversely, deleting an entity from `A` that has matching entities in `B` will result -in the deletion of those matching entities and so forth, recursively, downstream in the -pipeline. +# This fails - subject_id=999 doesn't exist +Session.insert1({'subject_id': 999, 'session_date': '2024-01-01'}) +# IntegrityError: Cannot add or update a child row: foreign key constraint fails +``` -When `B` references `A` with a foreign key, one can say that `B` **depends** on `A`. -In DataJoint terms, `B` is the **dependent table** and `A` is the **referenced table** -with respect to the foreign key from `B` to `A`. +### Cascading Deletes -Note to those already familiar with the theory of relational databases: The usage of -the words "depends" and "dependency" here should not be confused with the unrelated -concept of *functional dependencies* that is used to define normal forms. +Deleting a parent automatically deletes all dependent children: -## Referential integrity +```python +# Delete subject 1 - also deletes all its sessions and trials +(Subject & 'subject_id=1').delete() +``` -Dependencies enforce the desired property of databases known as -**referential integrity**. -Referential integrity is the guarantee made by the data management process that related -data across the database remain present, correctly associated, and mutually consistent. -Guaranteeing referential integrity means enforcing the constraint that no entity can -exist in the database without all the other entities on which it depends. -An entity in table `B` depends on an entity in table `A` when they belong to them or -are computed from them. +DataJoint prompts for confirmation showing all affected tables and entity counts. -## Dependencies with renamed attributes +## Foreign Key Options -In most cases, a dependency includes the primary key attributes of the referenced table -as they appear in its table definition. -Sometimes it can be helpful to choose a new name for a foreign key attribute that -better fits the context of the dependent table. -DataJoint provides the following [projection](../../query/project.md) syntax to rename -the primary key attributes when they are included in the new table. +### nullable -The dependency +Makes the reference optional: ```python --> Table.project(new_attr='old_attr') +@schema +class Session(dj.Manual): + definition = """ + -> Subject + session_date : date + --- + -> [nullable] User # experimenter may be unknown + """ ``` -renames the primary key attribute `old_attr` of `Table` as `new_attr` before -integrating it into the table definition. -Any additional primary key attributes will retain their original names. -For example, the table `Experiment` may depend on table `User` but rename the `user` -attribute into `operator` as follows: +With `nullable`, the `User` attributes can be `NULL` if no user is specified. + +### unique + +Enforces one-to-one relationships: ```python --> User.proj(operator='user') +@schema +class Equipment(dj.Manual): + definition = """ + equipment_id : int + --- + name : varchar(100) + -> [unique] User # each user owns at most one equipment + """ ``` -In the above example, an entity in the dependent table depends on exactly one entity in -the referenced table. -Sometimes entities may depend on multiple entities from the same table. -Such a design requires a way to distinguish between dependent attributes having the -same name in the reference table. -For example, a table for `Synapse` may reference the table `Cell` twice as -`presynaptic` and `postsynaptic`. -The table definition may appear as +### Combined Options ```python -# synapse between two cells --> Cell.proj(presynaptic='cell_id') --> Cell.proj(postsynaptic='cell_id') ---- -connection_strength : double # (pA) peak synaptic current +@schema +class Rig(dj.Manual): + definition = """ + rig_id : char(4) + --- + -> [unique, nullable] User # optionally assigned to at most one user + """ ``` -If the primary key of `Cell` is (`animal_id`, `slice_id`, `cell_id`), then the primary -key of `Synapse` resulting from the above definition will be (`animal_id`, `slice_id`, -`presynaptic`, `postsynaptic`). -Projection always returns all of the primary key attributes of a table, so `animal_id` -and `slice_id` are included, with their original names. +**Note**: Primary key foreign keys cannot be `nullable` since primary keys cannot +contain NULL values. They can be `unique`. -Note that the design of the `Synapse` table above imposes the constraint that the -synapse can only be found between cells in the same animal and in the same slice. +## Renamed Foreign Keys -Allowing representation of synapses between cells from different slices requires the -renamimg of `slice_id` as well: +Rename inherited attributes using projection syntax: + +### Single Attribute Rename ```python -# synapse between two cells --> Cell(presynaptic_slice='slice_id', presynaptic_cell='cell_id') --> Cell(postsynaptic_slice='slice_id', postsynaptic_cell='cell_id') ---- -connection_strength : double # (pA) peak synaptic current +@schema +class Experiment(dj.Manual): + definition = """ + experiment_id : int + --- + -> User.proj(experimenter='username') # rename 'username' to 'experimenter' + start_date : date + """ ``` -In this case, the primary key of `Synapse` will be (`animal_id`, `presynaptic_slice`, -`presynaptic_cell`, `postsynaptic_slice`, `postsynaptic_cell`). -This primary key still imposes the constraint that synapses can only form between cells -within the same animal but now allows connecting cells across different slices. +### Multiple References to Same Table -In the Diagram, renamed foreign keys are shown as red lines with an additional dot node -in the middle to indicate that a renaming took place. +When referencing a table multiple times, use renames to distinguish: -## Foreign key options +```python +@schema +class Synapse(dj.Manual): + definition = """ + -> Cell.proj(pre_cell='cell_id') # presynaptic cell + -> Cell.proj(post_cell='cell_id') # postsynaptic cell + --- + strength : float # synaptic strength + """ +``` -Note: Foreign key options are currently in development. +If `Cell` has primary key `(animal_id, slice_id, cell_id)`, then `Synapse` has +primary key `(animal_id, slice_id, pre_cell, post_cell)`. -Foreign keys allow the additional options `nullable` and `unique`, which can be -inserted in square brackets following the arrow. +### Fully Disambiguated References -For example, in the following table definition +To allow connections across slices, rename additional attributes: ```python -rig_id : char(4) # experimental rig ---- --> Person +@schema +class Synapse(dj.Manual): + definition = """ + -> Cell.proj(pre_slice='slice_id', pre_cell='cell_id') + -> Cell.proj(post_slice='slice_id', post_cell='cell_id') + --- + strength : float + """ ``` -each rig belongs to a person, but the table definition does not prevent one person -owning multiple rigs. -With the `unique` option, a person may only appear once in the entire table, which -means that no one person can own more than one rig. +Primary key: `(animal_id, pre_slice, pre_cell, post_slice, post_cell)` + +## Viewing Dependencies + +### Examine Table Heading ```python -rig_id : char(4) # experimental rig ---- --> [unique] Person +Session.heading +# Shows all attributes including those inherited via foreign keys ``` -With the `nullable` option, a rig may not belong to anyone, in which case the foreign -key attributes for `Person` are set to `NULL`: +### Entity Relationship Diagram ```python -rig_id : char(4) # experimental rig ---- --> [nullable] Person +dj.Diagram(schema) +# Visualize all tables and their dependencies ``` -Finally with both `unique` and `nullable`, a rig may or may not be owned by anyone and -each person may own up to one rig. +In diagrams: +- **Solid lines**: Primary key foreign keys +- **Dashed lines**: Secondary foreign keys +- **Red lines with dots**: Renamed foreign keys + +## Dependency Patterns + +### Hub Pattern + +Multiple tables reference a central table: + +```python +@schema +class Subject(dj.Manual): + definition = """ + subject_id : int + --- + ... + """ + +@schema +class Surgery(dj.Manual): + definition = """ + -> Subject + surgery_date : date + --- + ... + """ + +@schema +class Behavior(dj.Imported): + definition = """ + -> Subject + behavior_date : date + --- + ... + """ + +@schema +class Imaging(dj.Imported): + definition = """ + -> Subject + imaging_date : date + --- + ... + """ +``` + +### Chain Pattern + +Sequential processing pipeline: ```python -rig_id : char(4) # experimental rig ---- --> [unique, nullable] Person +@schema +class RawData(dj.Imported): + definition = """ + -> Session + --- + data : longblob + """ + +@schema +class ProcessedData(dj.Computed): + definition = """ + -> RawData + --- + processed : longblob + """ + +@schema +class Analysis(dj.Computed): + definition = """ + -> ProcessedData + --- + result : float + """ ``` -Foreign keys made from the primary key cannot be nullable but may be unique. +### Fork/Join Pattern + +Multiple paths converging: + +```python +@schema +class NeuralData(dj.Imported): + definition = """ + -> Session + --- + spikes : longblob + """ + +@schema +class BehaviorData(dj.Imported): + definition = """ + -> Session + --- + events : longblob + """ + +@schema +class NeuralBehaviorAnalysis(dj.Computed): + definition = """ + -> NeuralData + -> BehaviorData + --- + correlation : float + """ +``` + +## Best Practices + +1. **Use meaningful names**: Choose descriptive table and attribute names +2. **Keep primary keys minimal**: Include only attributes necessary to identify entities +3. **Design for queries**: Consider what joins you'll need when placing foreign keys +4. **Avoid circular dependencies**: DataJoint requires a directed acyclic graph +5. **Use nullable sparingly**: Only when the reference is truly optional diff --git a/docs/src/design/tables/master-part.md b/docs/src/design/tables/master-part.md index 629bfb8ab..393bef6b2 100644 --- a/docs/src/design/tables/master-part.md +++ b/docs/src/design/tables/master-part.md @@ -26,8 +26,8 @@ class Segmentation(dj.Computed): -> Segmentation roi : smallint # roi number --- - roi_pixels : longblob # indices of pixels - roi_weights : longblob # weights of pixels + roi_pixels : # indices of pixels + roi_weights : # weights of pixels """ def make(self, key): @@ -68,7 +68,7 @@ directly. The only valid method to delete from a part table is to delete the master. This has been an unenforced rule, but upcoming versions of DataJoint will prohibit direct deletes from the master table. -DataJoint's [delete](../../manipulation/delete.md) operation is also enclosed in a +DataJoint's [delete](../../operations/delete.md) operation is also enclosed in a transaction. Together, the rules of master-part relationships ensure a key aspect of data integrity: @@ -86,23 +86,44 @@ For example: ```python @schema class ArrayResponse(dj.Computed): -definition = """ -array: int -""" - -class ElectrodeResponse(dj.Part): -definition = """ --> master -electrode: int # electrode number on the probe -""" - -class ChannelResponse(dj.Part): -definition = """ --> ElectrodeResponse -channel: int ---- -response: longblob # response of a channel -""" + definition = """ + -> ArrayInfo + --- + timestamp : datetime + """ + + class ElectrodeResponse(dj.Part): + definition = """ + -> master + electrode : int # electrode number on the probe + --- + electrode_signal : longblob + """ + + class ChannelResponse(dj.Part): + definition = """ + -> master.ElectrodeResponse + channel : int + --- + response : longblob # response of a channel + """ + + def make(self, key): + # Insert master record + self.insert1(dict(key, timestamp=datetime.now())) + + # Get electrode data and insert ElectrodeResponse parts + for electrode_id, electrode_data in enumerate(get_electrodes(key)): + electrode_key = dict(key, electrode=electrode_id) + self.ElectrodeResponse.insert1( + dict(electrode_key, electrode_signal=electrode_data['signal']) + ) + + # Insert ChannelResponse parts for each electrode + for channel_id, channel_data in enumerate(electrode_data['channels']): + self.ChannelResponse.insert1( + dict(electrode_key, channel=channel_id, response=channel_data) + ) ``` Conceptually, one or more channels belongs to an electrode, and one or more electrodes diff --git a/docs/src/design/tables/object-type-spec.md b/docs/src/design/tables/object-type-spec.md new file mode 100644 index 000000000..dea83c5f4 --- /dev/null +++ b/docs/src/design/tables/object-type-spec.md @@ -0,0 +1,1293 @@ +# Object Column Type Specification + +## Overview + +The `object` type introduces a new paradigm for managed file storage in DataJoint. Unlike existing `attach@store` and `filepath@store` types that reference named stores, the `object` type uses a **unified storage backend** that is tightly coupled with the schema and configured at the pipeline level. + +The `object` type supports both **files and folders**. Content is copied to storage at insert time, referenced via handle on fetch, and deleted when the record is deleted. + +### Immutability Contract + +Objects stored via the `object` type are **immutable after finalization**. Users agree to: +- **Insert (copy)**: Copy existing content to storage +- **Insert (staged)**: Reserve path, write directly, then finalize +- **Fetch**: Read content via handle (no modification) +- **Delete**: Remove content when record is deleted (only way to remove) + +Once an object is **finalized** (either via copy-insert or staged-insert completion), users must not directly modify it in the object store. + +#### Two Insert Modes + +| Mode | Use Case | Workflow | +|------|----------|----------| +| **Copy** | Small files, existing data | Local file → copy to storage → insert record | +| **Staged** | Large objects, Zarr, TileDB | Reserve path → write directly to storage → finalize record | + +### Augmented Schema vs External References + +The `object` type implements **Augmented Schema (AUS)** — a paradigm where the object store becomes a true extension of the relational database: + +- **DataJoint fully controls** the object store lifecycle +- **Only DataJoint writes** to the object store (users may have direct read access) +- **Tight coupling** between database and object store +- **Joint transaction management** on objects and database records +- **Single backend per pipeline** — all managed objects live together + +This is fundamentally different from **external references**, where DataJoint merely points to user-managed data: + +| Aspect | `object` (Augmented Schema) | `filepath@store` (External Reference) | +|--------|----------------------------|--------------------------------------| +| **Ownership** | DataJoint owns the data | User owns the data | +| **Writes** | Only via DataJoint | User writes directly | +| **Deletion** | DataJoint deletes on record delete | User manages lifecycle | +| **Multi-backend** | Single backend per pipeline | Multiple named stores | +| **Use case** | Pipeline-generated data | Collaborator data, legacy assets | + +**When to use each:** + +- Use `object` for data that DataJoint should own and manage as part of the schema (e.g., processed results, derived datasets) +- Use `filepath@store` for referencing externally-managed data across multiple backends (e.g., collaborator data on different cloud providers, legacy data that shouldn't be moved) + +## Storage Architecture + +### Single Storage Backend Per Pipeline + +Each DataJoint pipeline has **one** associated storage backend configured in `datajoint.json`. DataJoint fully controls the path structure within this backend. + +### Supported Backends + +DataJoint uses **[`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/)** to ensure compatibility across multiple storage backends: + +- **Local storage** – POSIX-compliant file systems (e.g., NFS, SMB) +- **Cloud-based object storage** – Amazon S3, Google Cloud Storage, Azure Blob, MinIO + +## Project Structure + +A DataJoint project creates a structured hierarchical storage pattern: + +``` +📁 project_name/ +├── datajoint_store.json # store metadata (not client config) +├── 📁 schema_name/ +│ ├── 📁 Table1/ +│ │ ├── data.parquet # tabular data export (future) +│ │ └── 📁 objects/ # object storage for this table +│ │ ├── pk1=val1/pk2=val2/field1_token.dat +│ │ └── pk1=val1/pk2=val2/field2_token.zarr +│ ├── 📁 Table2/ +│ │ ├── data.parquet +│ │ └── 📁 objects/ +│ │ └── ... +``` + +### Object Storage Keys + +When using cloud object storage: + +``` +s3://bucket/project_name/schema_name/Table1/objects/pk1=val1/field_token.dat +s3://bucket/project_name/schema_name/Table1/objects/pk1=val1/field_token.zarr +``` + +## Configuration + +### Settings Structure + +Object storage is configured in `datajoint.json` using the existing settings system: + +```json +{ + "database.host": "localhost", + "database.user": "datajoint", + + "object_storage.project_name": "my_project", + "object_storage.protocol": "s3", + "object_storage.endpoint": "s3.amazonaws.com", + "object_storage.bucket": "my-bucket", + "object_storage.location": "my_project", + "object_storage.partition_pattern": "{subject_id}/{session_id}" +} +``` + +For local filesystem storage: + +```json +{ + "object_storage.project_name": "my_project", + "object_storage.protocol": "file", + "object_storage.location": "/data/my_project", + "object_storage.partition_pattern": "{subject_id}/{session_id}" +} +``` + +### Settings Schema + +| Setting | Type | Required | Description | +|---------|------|----------|-------------| +| `object_storage.project_name` | string | Yes | Unique project identifier (must match store metadata) | +| `object_storage.protocol` | string | Yes | Storage backend: `file`, `s3`, `gcs`, `azure` | +| `object_storage.location` | string | Yes | Base path or bucket prefix | +| `object_storage.bucket` | string | For cloud | Bucket name (S3, GCS, Azure) | +| `object_storage.endpoint` | string | For S3 | S3 endpoint URL | +| `object_storage.partition_pattern` | string | No | Path pattern with `{attribute}` placeholders | +| `object_storage.token_length` | int | No | Random suffix length for filenames (default: 8, range: 4-16) | +| `object_storage.access_key` | string | For cloud | Access key (can use secrets file) | +| `object_storage.secret_key` | string | For cloud | Secret key (can use secrets file) | + +### Environment Variables + +Settings can be overridden via environment variables: + +```bash +DJ_OBJECT_STORAGE_PROTOCOL=s3 +DJ_OBJECT_STORAGE_BUCKET=my-bucket +DJ_OBJECT_STORAGE_LOCATION=my_project +DJ_OBJECT_STORAGE_PARTITION_PATTERN="subject{subject_id}/session{session_id}" +``` + +### Secrets + +Credentials can be stored in the `.secrets/` directory: + +``` +.secrets/ +├── object_storage.access_key +└── object_storage.secret_key +``` + +### Partition Pattern + +The partition pattern is configured **per pipeline** (one per settings file). Placeholders use `{attribute_name}` syntax and are replaced with primary key values. + +```json +{ + "object_storage.partition_pattern": "subject{subject_id}/session{session_id}" +} +``` + +**Example with partitioning:** + +``` +s3://my-bucket/my_project/subject_id=123/session_id=45/schema_name/Recording/objects/raw_data_Ax7bQ2kM.dat +``` + +If no partition pattern is specified, files are organized directly under `{location}/{schema}/{Table}/objects/`. + +## Store Metadata (`datajoint_store.json`) + +Each object store contains a metadata file at its root that identifies the store and enables verification by DataJoint clients. This file is named `datajoint_store.json` to distinguish it from client configuration files (`datajoint.json`). + +### Location + +``` +{location}/datajoint_store.json +``` + +For cloud storage: +``` +s3://bucket/my_project/datajoint_store.json +``` + +### Content + +```json +{ + "project_name": "my_project", + "created": "2025-01-15T10:30:00Z", + "format_version": "1.0", + "datajoint_version": "0.15.0", + "database_host": "db.example.com", + "database_name": "my_project_db" +} +``` + +### Schema + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `project_name` | string | Yes | Unique project identifier | +| `created` | string | Yes | ISO 8601 timestamp of store creation | +| `format_version` | string | Yes | Store format version for compatibility | +| `datajoint_version` | string | Yes | DataJoint version that created the store | +| `database_host` | string | No | Database server hostname (for bidirectional mapping) | +| `database_name` | string | No | Database name (for bidirectional mapping) | + +The optional `database_host` and `database_name` fields enable bidirectional mapping between object stores and databases. This is informational only - not enforced at runtime. Administrators can alternatively ensure unique `project_name` values across their namespace, and managed platforms may handle this mapping externally. + +### Store Initialization + +The store metadata file is created when the first `object` attribute is used: + +``` +┌─────────────────────────────────────────────────────────┐ +│ 1. Client attempts first file operation │ +├─────────────────────────────────────────────────────────┤ +│ 2. Check if datajoint_store.json exists │ +│ ├─ If exists: verify project_name matches │ +│ └─ If not: create with current project_name │ +├─────────────────────────────────────────────────────────┤ +│ 3. On mismatch: raise DataJointError │ +└─────────────────────────────────────────────────────────┘ +``` + +### Client Verification + +DataJoint performs a basic verification on connect to ensure store-database cohesion: + +1. **On connect**: Client reads `datajoint_store.json` from store +2. **Verify**: `project_name` in client settings matches store metadata +3. **On mismatch**: Raise `DataJointError` with descriptive message + +```python +# Example error +DataJointError: Object store project name mismatch. + Client configured: "project_a" + Store metadata: "project_b" + Ensure all clients use the same object_storage.project_name setting. +``` + +### Administrative Responsibility + +A 1:1 correspondence is assumed between: +- Database location + `project_name` in client settings +- Object store + `project_name` in store metadata + +DataJoint performs basic verification but does **not** enforce this mapping. Administrators are responsible for ensuring correct configuration across all clients. + +## Syntax + +```python +@schema +class Recording(dj.Manual): + definition = """ + subject_id : int + session_id : int + --- + raw_data : object # managed file storage + processed : object # another object attribute + """ +``` + +Note: No `@store` suffix needed - storage is determined by pipeline configuration. + +## Database Storage + +The `object` type is stored as a `JSON` column in MySQL containing: + +**File example:** +```json +{ + "path": "my_schema/Recording/objects/subject_id=123/session_id=45/raw_data_Ax7bQ2kM.dat", + "size": 12345, + "hash": null, + "ext": ".dat", + "is_dir": false, + "timestamp": "2025-01-15T10:30:00Z", + "mime_type": "application/octet-stream" +} +``` + +**File with optional hash:** +```json +{ + "path": "my_schema/Recording/objects/subject_id=123/session_id=45/raw_data_Ax7bQ2kM.dat", + "size": 12345, + "hash": "sha256:abcdef1234...", + "ext": ".dat", + "is_dir": false, + "timestamp": "2025-01-15T10:30:00Z", + "mime_type": "application/octet-stream" +} +``` + +**Folder example:** +```json +{ + "path": "my_schema/Recording/objects/subject_id=123/session_id=45/raw_data_pL9nR4wE", + "size": 567890, + "hash": null, + "ext": null, + "is_dir": true, + "timestamp": "2025-01-15T10:30:00Z", + "item_count": 42 +} +``` + +**Zarr example (large dataset, metadata fields omitted for performance):** +```json +{ + "path": "my_schema/Recording/objects/subject_id=123/session_id=45/neural_data_kM3nP2qR.zarr", + "size": null, + "hash": null, + "ext": ".zarr", + "is_dir": true, + "timestamp": "2025-01-15T10:30:00Z" +} +``` + +### JSON Schema + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `path` | string | Yes | Full path/key within storage backend (includes token) | +| `size` | integer/null | No | Total size in bytes (sum for folders), or null if not computed. See [Performance Considerations](#performance-considerations). | +| `hash` | string/null | Yes | Content hash with algorithm prefix, or null (default) | +| `ext` | string/null | Yes | File extension as tooling hint (e.g., `.dat`, `.zarr`) or null. See [Extension Field](#extension-field). | +| `is_dir` | boolean | Yes | True if stored content is a directory/key-prefix (e.g., Zarr store) | +| `timestamp` | string | Yes | ISO 8601 upload timestamp | +| `mime_type` | string | No | MIME type (files only, auto-detected from extension) | +| `item_count` | integer | No | Number of files (folders only), or null if not computed. See [Performance Considerations](#performance-considerations). | + +### Extension Field + +The `ext` field is a **tooling hint** that preserves the original file extension or provides a conventional suffix for directory-based formats. It is: + +- **Not a content-type declaration**: Unlike `mime_type`, it does not attempt to describe the internal content format +- **Useful for tooling**: Enables file browsers, IDEs, and other tools to display appropriate icons or suggest applications +- **Conventional for formats like Zarr**: The `.zarr` extension is recognized by the ecosystem even though a Zarr store contains mixed content (JSON metadata + binary chunks) + +For single files, `ext` is extracted from the source filename. For staged inserts (like Zarr), it can be explicitly provided. + +### Performance Considerations + +For large hierarchical data like Zarr stores, computing certain metadata can be expensive: + +- **`size`**: Requires listing all objects and summing their sizes. For stores with millions of chunks, this can take minutes or hours. +- **`item_count`**: Requires listing all objects. Same performance concern as `size`. +- **`hash`**: Requires reading all content. Explicitly not supported for staged inserts. + +**These fields are optional** and default to `null` for staged inserts. Users can explicitly request computation when needed, understanding the performance implications. + +### Content Hashing + +By default, **no content hash is computed** to avoid performance overhead for large objects. Storage backend integrity is trusted. + +**Optional hashing** can be requested per-insert: + +```python +# Default - no hash (fast) +Recording.insert1({..., "raw_data": "/path/to/large.dat"}) + +# Request hash computation +Recording.insert1({..., "raw_data": "/path/to/important.dat"}, hash="sha256") +``` + +Supported hash algorithms: `sha256`, `md5`, `xxhash` (xxh3, faster for large files) + +**Staged inserts never compute hashes** - data is written directly to storage without a local copy to hash. + +### Folder Manifests + +For folders (directories), a **manifest file** is created alongside the folder in the object store to enable integrity verification without computing content hashes: + +``` +raw_data_pL9nR4wE/ +raw_data_pL9nR4wE.manifest.json +``` + +**Manifest content:** +```json +{ + "files": [ + {"path": "file1.dat", "size": 1234}, + {"path": "subdir/file2.dat", "size": 5678}, + {"path": "subdir/file3.dat", "size": 91011} + ], + "total_size": 567890, + "item_count": 42, + "created": "2025-01-15T10:30:00Z" +} +``` + +**Design rationale:** +- Stored in object store (not database) to avoid bloating the JSON for folders with many files +- Placed alongside folder (not inside) to avoid polluting folder contents and interfering with tools like Zarr +- Enables self-contained verification without database access + +The manifest enables: +- Quick verification that all expected files exist +- Size validation without reading file contents +- Detection of missing or extra files + +### Filename Convention + +The stored filename is **always derived from the field name**: +- **Base name**: The attribute/field name (e.g., `raw_data`) +- **Extension**: Adopted from source file (copy insert) or optionally provided (staged insert) +- **Token**: Random suffix for collision avoidance + +``` +Stored filename = {field}_{token}{ext} + +Examples: + raw_data_Ax7bQ2kM.dat # file with .dat extension + raw_data_pL9nR4wE.zarr # Zarr directory with .zarr extension + raw_data_kM3nP2qR # directory without extension +``` + +This convention ensures: +- Consistent, predictable naming across all objects +- Field name visible in storage for easier debugging +- Extension preserved for MIME type detection and tooling compatibility + +## Path Generation + +Storage paths are **deterministically constructed** from record metadata, enabling bidirectional lookup between database records and stored files. + +### Path Components + +1. **Location** - from configuration (`object_storage.location`) +2. **Partition attributes** - promoted PK attributes (if `partition_pattern` configured) +3. **Schema name** - from the table's schema +4. **Table name** - the table class name +5. **Object directory** - `objects/` +6. **Primary key encoding** - remaining PK attributes and values +7. **Suffixed filename** - `{field}_{token}{ext}` + +### Path Template + +**Without partitioning:** +``` +{location}/{schema}/{Table}/objects/{pk_attr1}={pk_val1}/{pk_attr2}={pk_val2}/.../{field}_{token}{ext} +``` + +**With partitioning:** +``` +{location}/{partition_attr}={val}/.../schema/{Table}/objects/{remaining_pk_attrs}/.../{field}_{token}{ext} +``` + +Note: The `objects/` directory follows the table name, allowing each table folder to also contain tabular data exports (e.g., `data.parquet`) alongside the objects. + +### Partitioning + +The **partition pattern** allows promoting certain primary key attributes to the beginning of the path (after `location`). This organizes storage by high-level attributes like subject or experiment, enabling: +- Efficient data locality for related records +- Easier manual browsing of storage +- Potential for storage tiering by partition + +**Configuration:** +```json +{ + "object_storage.partition_pattern": "{subject_id}/{experiment_id}" +} +``` + +Partition attributes are extracted from the primary key and placed at the path root. Remaining PK attributes appear in their normal position. + +### Example Without Partitioning + +For a table: +```python +@schema +class Recording(dj.Manual): + definition = """ + subject_id : int + session_id : int + --- + raw_data : object + """ +``` + +Inserting `{"subject_id": 123, "session_id": 45, "raw_data": "/path/to/recording.dat"}` produces: + +``` +my_project/my_schema/Recording/objects/subject_id=123/session_id=45/raw_data_Ax7bQ2kM.dat +``` + +Note: The filename is `raw_data` (field name) with `.dat` extension (from source file). + +### Example With Partitioning + +With `partition_pattern = "{subject_id}"`: + +``` +my_project/subject_id=123/my_schema/Recording/objects/session_id=45/raw_data_Ax7bQ2kM.dat +``` + +The `subject_id` is promoted to the path root, grouping all files for subject 123 together regardless of schema or table. + +### Deterministic Bidirectional Mapping + +The path structure (excluding the random token) is fully deterministic: +- **Record → File**: Given a record's primary key, construct the path prefix to locate its file +- **File → Record**: Parse the path to extract schema, table, field, and primary key values + +This enables: +- Finding all files for a specific record +- Identifying which record a file belongs to +- Auditing storage against database contents + +The **random token** is stored in the JSON metadata to complete the full path. + +### Primary Key Value Encoding + +Primary key values are encoded directly in paths when they are simple, path-safe types: +- **Integers**: Used directly (`subject_id=123`) +- **Dates**: ISO format (`session_date=2025-01-15`) +- **Timestamps**: ISO format with safe separators (`created=2025-01-15T10-30-00`) +- **Simple strings**: Used directly if path-safe (`experiment=baseline`) + +**Conversion to path-safe strings** is applied only when necessary: +- Strings containing `/`, `\`, or other path-unsafe characters +- Very long strings (truncated with hash suffix) +- Binary or complex types (hashed) + +```python +# Direct encoding (no conversion needed) +subject_id=123 +session_date=2025-01-15 +trial_type=control + +# Converted encoding (path-unsafe characters) +filename=my%2Ffile.dat # "/" encoded +description=a1b2c3d4_abc123 # long string truncated + hash +``` + +### Filename Collision Avoidance + +To prevent filename collisions, each stored object receives a **random token suffix** appended to the field name: + +``` +field: raw_data, source: recording.dat +stored: raw_data_Ax7bQ2kM.dat + +field: image, source: scan.tiff +stored: image_pL9nR4wE.tiff + +field: neural_data (staged with .zarr) +stored: neural_data_kM3nP2qR.zarr +``` + +#### Token Suffix Specification + +- **Alphabet**: URL-safe and filename-safe Base64 characters: `A-Z`, `a-z`, `0-9`, `-`, `_` +- **Length**: Configurable via `object_storage.token_length` (default: 8, range: 4-16) +- **Generation**: Cryptographically random using `secrets.token_urlsafe()` + +At 8 characters with 64 possible values per character: 64^8 = 281 trillion combinations. + +#### Rationale + +- Avoids collisions without requiring existence checks +- Field name visible in storage for easier debugging/auditing +- URL-safe for web-based access to cloud storage +- Filesystem-safe across all supported platforms + +### No Deduplication + +Each insert stores a separate copy of the file, even if identical content was previously stored. This ensures: +- Clear 1:1 relationship between records and files +- Simplified delete behavior +- No reference counting complexity + +## Insert Behavior + +At insert time, the `object` attribute accepts: + +1. **Local file path** (string or `Path`): Path to an existing local file (extension extracted) +2. **Local folder path** (string or `Path`): Path to an existing local directory +3. **Remote URL** (string): URL to remote file or folder (`s3://`, `gs://`, `az://`, `http://`, `https://`) +4. **Tuple of (ext, stream)**: File-like object with explicit extension + +```python +# From local file path - extension (.dat) extracted from source +Recording.insert1({ + "subject_id": 123, + "session_id": 45, + "raw_data": "/local/path/to/recording.dat" +}) +# Stored as: raw_data_Ax7bQ2kM.dat + +# From local folder path - no extension +Recording.insert1({ + "subject_id": 123, + "session_id": 45, + "raw_data": "/local/path/to/data_folder/" +}) +# Stored as: raw_data_pL9nR4wE/ + +# From remote URL - copies from source to managed storage +Recording.insert1({ + "subject_id": 123, + "session_id": 45, + "raw_data": "s3://source-bucket/path/to/data.dat" +}) +# Stored as: raw_data_kM3nP2qR.dat + +# From remote Zarr store (e.g., collaborator data on GCS) +Recording.insert1({ + "subject_id": 123, + "session_id": 45, + "neural_data": "gs://collaborator-bucket/shared/experiment.zarr" +}) +# Copied to managed storage as: neural_data_pL9nR4wE.zarr + +# From stream with explicit extension +with open("/local/path/data.bin", "rb") as f: + Recording.insert1({ + "subject_id": 123, + "session_id": 45, + "raw_data": (".bin", f) + }) +# Stored as: raw_data_xY8zW3vN.bin +``` + +### Remote URL Support + +Remote URLs are detected by protocol prefix and handled via fsspec: + +| Protocol | Example | Notes | +|----------|---------|-------| +| `s3://` | `s3://bucket/path/file.dat` | AWS S3, MinIO | +| `gs://` | `gs://bucket/path/file.dat` | Google Cloud Storage | +| `az://` | `az://container/path/file.dat` | Azure Blob Storage | +| `http://` | `http://server/path/file.dat` | HTTP (read-only source) | +| `https://` | `https://server/path/file.dat` | HTTPS (read-only source) | + +**Authentication**: Remote sources may require credentials. fsspec uses standard credential discovery (environment variables, config files, IAM roles). For cross-cloud copies, ensure credentials are configured for both source and destination. + +**Performance note**: For large remote-to-remote copies, data flows through the client. This is acceptable for most use cases but may be slow for very large datasets. Future optimizations could include server-side copy for same-provider transfers. + +### Insert Processing Steps + +1. Validate input (file/folder exists, stream is readable) +2. Generate deterministic storage path with random token +3. **Copy content to storage backend** via `fsspec` +4. **If copy fails: abort insert** (no database operation attempted) +5. Compute content hash (SHA-256) +6. Build JSON metadata structure +7. Execute database INSERT + +### Copy-First Semantics + +The file/folder is copied to storage **before** the database insert is attempted: +- If the copy fails, the insert does not proceed +- If the copy succeeds but the database insert fails, an orphaned file may remain +- Orphaned files are acceptable due to the random token (no collision with future inserts) + +### Staged Insert (Direct Write Mode) + +For large objects like Zarr arrays, copying from local storage is inefficient. **Staged insert** allows writing directly to the destination. + +#### Why a Separate Method? + +Staged insert uses a dedicated `staged_insert1` method rather than co-opting `insert1` because: + +1. **Explicit over implicit** - Staged inserts have fundamentally different semantics (file creation happens during context, commit on exit). A separate method makes this explicit. +2. **Backward compatibility** - `insert1` returns `None` and doesn't support context manager protocol. Changing this could break existing code. +3. **Clear error handling** - The context manager semantics (success = commit, exception = rollback) are obvious with `staged_insert1`. +4. **Type safety** - The staged context exposes `.store()` for object fields. A dedicated method can return a properly-typed `StagedInsert` object. + +**Staged inserts are limited to `insert1`** (one row at a time). Multi-row inserts are not supported for staged operations. + +#### Basic Usage + +```python +# Stage an insert with direct object storage writes +with Recording.staged_insert1 as staged: + # Set primary key values + staged.rec['subject_id'] = 123 + staged.rec['session_id'] = 45 + + # Create object storage directly using store() + # Extension is optional - .zarr is conventional for Zarr arrays + z = zarr.open(staged.store('raw_data', '.zarr'), mode='w', shape=(10000, 10000), dtype='f4') + z[:] = compute_large_array() + + # Assign the created object to the record + staged.rec['raw_data'] = z + +# On successful exit: metadata computed, record inserted +# On exception: storage cleaned up, no record inserted +# Stored as: raw_data_Ax7bQ2kM.zarr +``` + +#### StagedInsert Interface + +```python +class StagedInsert: + """Context manager for staged insert operations.""" + + rec: dict[str, Any] # Record dict for setting attribute values + + def store(self, field: str, ext: str = "") -> fsspec.FSMap: + """ + Get an FSMap store for direct writes to an object field. + + Args: + field: Name of the object attribute + ext: Optional extension (e.g., ".zarr", ".hdf5") + + Returns: + fsspec.FSMap suitable for Zarr/xarray + """ + ... + + def open(self, field: str, ext: str = "", mode: str = "wb") -> IO: + """ + Open a file for direct writes to an object field. + + Args: + field: Name of the object attribute + ext: Optional extension (e.g., ".bin", ".dat") + mode: File mode (default: "wb") + + Returns: + File-like object for writing + """ + ... + + @property + def fs(self) -> fsspec.AbstractFileSystem: + """Return fsspec filesystem for advanced operations.""" + ... +``` + +#### Staged Insert Flow + +``` +┌─────────────────────────────────────────────────────────┐ +│ 1. Enter context: create StagedInsert with empty rec │ +├─────────────────────────────────────────────────────────┤ +│ 2. User sets primary key values in staged.rec │ +├─────────────────────────────────────────────────────────┤ +│ 3. User calls store()/open() to get storage handles │ +│ - Path reserved with random token on first call │ +│ - User writes data directly via fsspec │ +├─────────────────────────────────────────────────────────┤ +│ 4. User assigns object references to staged.rec │ +├─────────────────────────────────────────────────────────┤ +│ 5. On context exit (success): │ +│ - Compute metadata (size, hash, item_count) │ +│ - Execute database INSERT │ +├─────────────────────────────────────────────────────────┤ +│ 6. On context exit (exception): │ +│ - Delete any written data │ +│ - Re-raise exception │ +└─────────────────────────────────────────────────────────┘ +``` + +#### Zarr Example + +```python +import zarr +import numpy as np + +# Create a large Zarr array directly in object storage +with Recording.staged_insert1 as staged: + staged.rec['subject_id'] = 123 + staged.rec['session_id'] = 45 + + # Create Zarr hierarchy directly in object storage + # .zarr extension is optional but conventional + root = zarr.open(staged.store('neural_data', '.zarr'), mode='w') + root.create_dataset('timestamps', data=np.arange(1000000)) + root.create_dataset('waveforms', shape=(1000000, 82), chunks=(10000, 82)) + + # Write in chunks (streaming from acquisition) + for i, chunk in enumerate(data_stream): + root['waveforms'][i*10000:(i+1)*10000] = chunk + + # Assign to record + staged.rec['neural_data'] = root + +# Record automatically inserted with computed metadata +# Stored as: neural_data_kM3nP2qR.zarr +``` + +#### Multiple Object Fields + +```python +with Recording.staged_insert1 as staged: + staged.rec['subject_id'] = 123 + staged.rec['session_id'] = 45 + + # Write multiple object fields - extension optional + raw = zarr.open(staged.store('raw_data', '.zarr'), mode='w', shape=(1000, 1000)) + raw[:] = raw_array + + processed = zarr.open(staged.store('processed', '.zarr'), mode='w', shape=(100, 100)) + processed[:] = processed_array + + staged.rec['raw_data'] = raw + staged.rec['processed'] = processed + +# Stored as: raw_data_Ax7bQ2kM.zarr, processed_pL9nR4wE.zarr +``` + +#### Comparison: Copy vs Staged Insert + +| Aspect | Copy Insert | Staged Insert | +|--------|-------------|---------------| +| Data location | Must exist locally first | Written directly to storage | +| Efficiency | Copy overhead | No copy needed | +| Use case | Small files, existing data | Large arrays, streaming data | +| Cleanup on failure | Orphan possible | Cleaned up | +| API | `insert1({..., "field": path})` | `staged_insert1` context manager | +| Multi-row | Supported | Not supported (insert1 only) | + +## Transaction Handling + +Since storage backends don't support distributed transactions with MySQL, DataJoint uses a **copy-first** strategy. + +### Insert Transaction Flow + +``` +┌─────────────────────────────────────────────────────────┐ +│ 1. Validate input and generate storage path with token │ +├─────────────────────────────────────────────────────────┤ +│ 2. Copy file/folder to storage backend │ +│ └─ On failure: raise error, INSERT not attempted │ +├─────────────────────────────────────────────────────────┤ +│ 3. Compute hash and build JSON metadata │ +├─────────────────────────────────────────────────────────┤ +│ 4. Execute database INSERT │ +│ └─ On failure: orphaned file remains (acceptable) │ +├─────────────────────────────────────────────────────────┤ +│ 5. Commit database transaction │ +│ └─ On failure: orphaned file remains (acceptable) │ +└─────────────────────────────────────────────────────────┘ +``` + +### Failure Scenarios + +| Scenario | Result | Orphaned File? | +|----------|--------|----------------| +| Copy fails | Clean failure, no INSERT | No | +| DB insert fails | Error raised | Yes (acceptable) | +| DB commit fails | Error raised | Yes (acceptable) | + +### Orphaned Files + +Orphaned files (files in storage without corresponding database records) may accumulate due to: +- Failed database inserts after successful copy +- Process crashes +- Network failures + +**This is acceptable** because: +- Random tokens prevent collisions with future inserts +- Orphaned files can be identified by comparing storage contents with database records +- A separate cleanup procedure removes orphaned files during maintenance + +### Orphan Cleanup Procedure + +Orphan cleanup is a **separate maintenance operation** that must be performed during maintenance windows to avoid race conditions with concurrent inserts. + +```python +# Maintenance utility methods +schema.file_storage.find_orphaned() # List files not referenced in DB +schema.file_storage.cleanup_orphaned() # Delete orphaned files +``` + +**Important considerations:** +- Should be run during low-activity periods +- Uses transactions or locking to avoid race conditions with concurrent inserts +- Files recently uploaded (within a grace period) are excluded to handle in-flight inserts +- Provides dry-run mode to preview deletions before execution + +## Fetch Behavior + +On fetch, the `object` type returns a **handle** (`ObjectRef` object) to the stored content. **The file is not copied** - all operations access the storage backend directly. + +```python +record = Recording.fetch1() +file_ref = record["raw_data"] + +# Access metadata (no I/O) +print(file_ref.path) # Full storage path +print(file_ref.size) # File size in bytes +print(file_ref.hash) # Content hash (if computed) or None +print(file_ref.ext) # File extension (e.g., ".dat") or None +print(file_ref.is_dir) # True if stored content is a folder + +# Read content directly from storage backend +content = file_ref.read() # Returns bytes (files only) + +# Open as fsspec file object (files only) +with file_ref.open() as f: + data = f.read() + +# List contents (folders only) +contents = file_ref.listdir() # Returns list of relative paths + +# Access specific file within folder +with file_ref.open("subdir/file.dat") as f: + data = f.read() +``` + +### No Automatic Download + +Unlike `attach@store`, the `object` type does **not** automatically download content to a local path. Users access content directly through the `ObjectRef` handle, which streams from the storage backend. + +For local copies, users explicitly download: + +```python +# Download file to local destination +local_path = file_ref.download("/local/destination/") + +# Download specific file from folder +local_path = file_ref.download("/local/destination/", "subdir/file.dat") +``` + +## Implementation Components + +### 1. Settings Extension (`settings.py`) + +New `ObjectStorageSettings` class: + +```python +class ObjectStorageSettings(BaseSettings): + """Object storage configuration for object columns.""" + + model_config = SettingsConfigDict( + env_prefix="DJ_OBJECT_STORAGE_", + extra="forbid", + validate_assignment=True, + ) + + project_name: str | None = None # Must match store metadata + protocol: Literal["object", "s3", "gcs", "azure"] | None = None + location: str | None = None + bucket: str | None = None + endpoint: str | None = None + partition_pattern: str | None = None + token_length: int = Field(default=8, ge=4, le=16) + access_key: str | None = None + secret_key: SecretStr | None = None +``` + +Add to main `Config` class: + +```python +object_storage: ObjectStorageSettings = Field(default_factory=ObjectStorageSettings) +``` + +### 2. Storage Backend (`storage.py` - new module) + +- `StorageBackend` class wrapping `fsspec` +- Methods: `upload()`, `download()`, `open()`, `exists()`, `delete()` +- Path generation with partition support + +### 3. Type Declaration (`declare.py`) + +- Add `OBJECT` pattern: `object$` +- Add to `SPECIAL_TYPES` +- Substitute to `JSON` type in database + +### 4. Schema Integration (`schemas.py`) + +- Associate storage backend with schema +- Validate storage configuration on schema creation + +### 5. Insert Processing (`table.py`) + +- New `__process_file_attribute()` method +- Path generation using primary key and partition pattern +- Upload via storage backend + +### 6. Fetch Processing (`fetch.py`) + +- New `ObjectRef` class +- Lazy loading from storage backend +- Metadata access interface + +### 7. ObjectRef Class (`objectref.py` - new module) + +```python +@dataclass +class ObjectRef: + """Handle to a file or folder stored in the pipeline's storage backend.""" + + path: str + size: int + hash: str | None # content hash (if computed) or None + ext: str | None # file extension (e.g., ".dat") or None + is_dir: bool + timestamp: datetime + mime_type: str | None # files only, derived from ext + item_count: int | None # folders only + _backend: StorageBackend # internal reference + + # fsspec access (for Zarr, xarray, etc.) + @property + def fs(self) -> fsspec.AbstractFileSystem: + """Return fsspec filesystem for direct access.""" + ... + + @property + def store(self) -> fsspec.FSMap: + """Return FSMap suitable for Zarr/xarray.""" + ... + + @property + def full_path(self) -> str: + """Return full URI (e.g., 's3://bucket/path').""" + ... + + # File operations + def read(self) -> bytes: ... + def open(self, subpath: str | None = None, mode: str = "rb") -> IO: ... + + # Folder operations + def listdir(self, subpath: str = "") -> list[str]: ... + def walk(self) -> Iterator[tuple[str, list[str], list[str]]]: ... + + # Common operations + def download(self, destination: Path | str, subpath: str | None = None) -> Path: ... + def exists(self, subpath: str | None = None) -> bool: ... + + # Integrity verification + def verify(self) -> bool: + """ + Verify object integrity. + + For files: checks size matches, and hash if available. + For folders: validates manifest (all files exist with correct sizes). + + Returns True if valid, raises IntegrityError with details if not. + """ + ... +``` + +#### fsspec Integration + +The `ObjectRef` provides direct fsspec access for integration with array libraries: + +```python +import zarr +import xarray as xr + +record = Recording.fetch1() +obj_ref = record["raw_data"] + +# Direct Zarr access +z = zarr.open(obj_ref.store, mode='r') +print(z.shape) + +# Direct xarray access +ds = xr.open_zarr(obj_ref.store) + +# Use fsspec filesystem directly +fs = obj_ref.fs +files = fs.ls(obj_ref.full_path) +``` + +## Dependencies + +New dependency: `fsspec` with optional backend-specific packages: + +```toml +[project.dependencies] +fsspec = ">=2023.1.0" + +[project.optional-dependencies] +s3 = ["s3fs"] +gcs = ["gcsfs"] +azure = ["adlfs"] +``` + +### Storage Access Architecture + +The `object` type separates **data declaration** (the JSON metadata stored in the database) from **storage access** (the library used to read/write objects): + +- **Data declaration**: The JSON schema (path, size, hash, etc.) is a pure data structure with no library dependencies +- **Storage access**: Currently uses `fsspec` as the default accessor, but the architecture supports alternative backends + +**Why this matters**: While `fsspec` is a mature and widely-used library, alternatives like [`obstore`](https://github.com/developmentseed/obstore) offer performance advantages for certain workloads. By keeping the data model independent of the access library, future versions can support pluggable storage accessors without schema changes. + +**Current implementation**: The `ObjectRef` class provides fsspec-based accessors (`fs`, `store` properties). Future versions may add: +- Pluggable accessor interface +- Alternative backends (obstore, custom implementations) +- Backend selection per-operation or per-configuration + +## Comparison with Existing Types + +| Feature | `attach@store` | `filepath@store` | `object` | +|---------|----------------|------------------|--------| +| Store config | Per-attribute | Per-attribute | Per-pipeline | +| Path control | DataJoint | User-managed | DataJoint | +| DB column | binary(16) UUID | binary(16) UUID | JSON | +| Hidden tables | Yes (external) | Yes (external) | **No** | +| Backend | File/S3 only | File/S3 only | fsspec (any) | +| Partitioning | Hash-based | User path | Configurable | +| Metadata storage | External table | External table | Inline JSON | +| Deduplication | By content | By path | None | + +### No Hidden Tables + +A key architectural difference: the `object` type does **not** use hidden external tables. + +The legacy `attach@store` and `filepath@store` types store a UUID in the table column and maintain a separate hidden `~external_*` table containing: +- File paths/keys +- Checksums +- Size information +- Reference counts + +The `object` type eliminates this complexity by storing all metadata **inline** in the JSON column. This provides: +- **Simpler schema** - no hidden tables to manage or migrate +- **Self-contained records** - all information in one place +- **Easier debugging** - metadata visible directly in queries +- **No reference counting** - each record owns its object exclusively + +### Legacy Type Deprecation + +The existing `attach@store` and `filepath@store` types will be: +- **Maintained** for backward compatibility with existing pipelines +- **Deprecated** in future releases with migration warnings +- **Eventually removed** after a transition period + +New pipelines should use the `object` type exclusively. + +## Delete Behavior + +When a record with a `object` attribute is deleted: + +1. **Database delete executes first** (within transaction) +2. **File delete is attempted** after successful DB commit +3. **File delete is best-effort** - the delete transaction succeeds even if file deletion fails + +### Delete Transaction Flow + +``` +┌─────────────────────────────────────────────────────────┐ +│ 1. Execute database DELETE │ +├─────────────────────────────────────────────────────────┤ +│ 2. Commit database transaction │ +│ └─ On failure: rollback, files unchanged │ +├─────────────────────────────────────────────────────────┤ +│ 3. Issue delete command to storage backend │ +│ └─ On failure: log warning, transaction still OK │ +└─────────────────────────────────────────────────────────┘ +``` + +### Stale Files + +If file deletion fails (network error, permissions, etc.), **stale files** may remain in storage. This is acceptable because: +- The database record is already deleted (authoritative source) +- Random tokens prevent any collision with future inserts +- Stale files can be identified and cleaned via orphan detection utilities + +### No Reference Counting + +Each record owns its file exclusively. There is no deduplication or reference counting, simplifying delete logic. + +## Migration Path + +- Existing `attach@store` and `filepath@store` remain unchanged +- `object` type is additive - new tables only +- Future: Migration utilities to convert existing external storage + +## Zarr, TileDB, and Large Hierarchical Data + +The `object` type is designed with **chunk-based formats** like Zarr and TileDB in mind. These formats store each chunk as a separate object, which maps naturally to object storage. + +### Staged Insert Compatibility + +**Staged inserts work with formats that support chunk-based writes:** + +| Format | Staged Insert | Why | +|--------|---------------|-----| +| **Zarr** | ✅ Yes | Each chunk is a separate object | +| **TileDB** | ✅ Yes | Fragment-based storage maps to objects | +| **HDF5** | ❌ No | Single monolithic file requires random-access seek/write | + +**HDF5 limitation**: HDF5 files have internal B-tree structures that require random-access modifications. Object storage only supports full object PUT/GET operations, not partial updates. For HDF5, use **copy insert**: + +```python +# HDF5: Write locally, then copy to object storage +import h5py +import tempfile + +with tempfile.NamedTemporaryFile(suffix='.h5', delete=False) as f: + with h5py.File(f.name, 'w') as h5: + h5.create_dataset('data', data=large_array) + Recording.insert1({..., 'data_file': f.name}) +``` + +For cloud-native workflows with large arrays, **Zarr is recommended** over HDF5. + +### Recommended Workflow (Zarr) + +For large Zarr stores, use **staged insert** to write directly to object storage: + +```python +import zarr +import numpy as np + +with Recording.staged_insert1 as staged: + staged.rec['subject_id'] = 123 + staged.rec['session_id'] = 45 + + # Write Zarr directly to object storage + store = staged.store('neural_data', '.zarr') + root = zarr.open(store, mode='w') + root.create_dataset('spikes', shape=(1000000, 384), chunks=(10000, 384), dtype='f4') + + # Stream data without local intermediate copy + for i, chunk in enumerate(acquisition_stream): + root['spikes'][i*10000:(i+1)*10000] = chunk + + staged.rec['neural_data'] = root + +# Metadata recorded, no expensive size/hash computation +``` + +### JSON Metadata for Zarr + +For Zarr stores, the recommended JSON metadata omits expensive-to-compute fields: + +```json +{ + "path": "schema/Recording/objects/subject_id=123/session_id=45/neural_data_kM3nP2qR.zarr", + "size": null, + "hash": null, + "ext": ".zarr", + "is_dir": true, + "timestamp": "2025-01-15T10:30:00Z" +} +``` + +**Field notes for Zarr:** +- **`size`**: Set to `null` - computing total size requires listing all chunks +- **`hash`**: Always `null` for staged inserts - no merkle tree support currently +- **`ext`**: Set to `.zarr` as a conventional tooling hint +- **`is_dir`**: Set to `true` - Zarr stores are key prefixes (logical directories) +- **`item_count`**: Omitted - counting chunks is expensive and rarely useful +- **`mime_type`**: Omitted - Zarr contains mixed content types + +### Reading Zarr Data + +The `ObjectRef` provides direct access compatible with Zarr and xarray: + +```python +record = Recording.fetch1() +obj_ref = record['neural_data'] + +# Direct Zarr access +z = zarr.open(obj_ref.store, mode='r') +print(z['spikes'].shape) + +# xarray integration +ds = xr.open_zarr(obj_ref.store) + +# Dask integration (lazy loading) +import dask.array as da +arr = da.from_zarr(obj_ref.store, component='spikes') +``` + +### Performance Tips + +1. **Use chunked writes**: Write data in chunks that match your Zarr chunk size +2. **Avoid metadata computation**: Let `size` and `item_count` default to `null` +3. **Use appropriate chunk sizes**: Balance between too many small files (overhead) and too few large files (memory) +4. **Consider compression**: Configure Zarr compression (blosc, zstd) to reduce storage costs + +## Future Extensions + +- [ ] Compression options (gzip, lz4, zstd) +- [ ] Encryption at rest +- [ ] Versioning support +- [ ] Streaming upload for large files +- [ ] Checksum verification on fetch +- [ ] Cache layer for frequently accessed files +- [ ] Parallel upload/download for large folders diff --git a/docs/src/design/tables/tiers.md b/docs/src/design/tables/tiers.md index 2cf1f9428..a58466a8a 100644 --- a/docs/src/design/tables/tiers.md +++ b/docs/src/design/tables/tiers.md @@ -1,68 +1,233 @@ # Data Tiers -DataJoint assigns all tables to one of the following data tiers that differentiate how -the data originate. - -## Table tiers - -| Tier | Superclass | Description | -| -- | -- | -- | -| Lookup | `dj.Lookup` | Small tables containing general facts and settings of the data pipeline; not specific to any experiment or dataset. | -| Manual | `dj.Manual` | Data entered from outside the pipeline, either by hand or with external helper scripts. | -| Imported | `dj.Imported` | Data ingested automatically inside the pipeline but requiring access to data outside the pipeline. | -| Computed | `dj.Computed` | Data computed automatically entirely inside the pipeline. | - -Table data tiers indicate to database administrators how valuable the data are. -Manual data are the most valuable, as re-entry may be tedious or impossible. -Computed data are safe to delete, as the data can always be recomputed from within DataJoint. -Imported data are safer than manual data but less safe than computed data because of -dependency on external data sources. -With these considerations, database administrators may opt not to back up computed -data, for example, or to back up imported data less frequently than manual data. - -The data tier of a table is specified by the superclass of its class. -For example, the User class in [definitions](declare.md) uses the `dj.Manual` -superclass. -Therefore, the corresponding User table on the database would be of the Manual tier. -Furthermore, the classes for **imported** and **computed** tables have additional -capabilities for automated processing as described in -[Auto-populate](../../compute/populate.md). - -## Internal conventions for naming tables - -On the server side, DataJoint uses a naming scheme to generate a table name -corresponding to a given class. -The naming scheme includes prefixes specifying each table's data tier. - -First, the name of the class is converted from `CamelCase` to `snake_case` -([separation by underscores](https://en.wikipedia.org/wiki/Snake_case)). -Then the name is prefixed according to the data tier. - -- `Manual` tables have no prefix. -- `Lookup` tables are prefixed with `#`. -- `Imported` tables are prefixed with `_`, a single underscore. -- `Computed` tables are prefixed with `__`, two underscores. - -For example: - -The table for the class `StructuralScan` subclassing `dj.Manual` will be named -`structural_scan`. - -The table for the class `SpatialFilter` subclassing `dj.Lookup` will be named -`#spatial_filter`. - -Again, the internal table names including prefixes are used only on the server side. -These are never visible to the user, and DataJoint users do not need to know these -conventions -However, database administrators may use these naming patterns to set backup policies -or to restrict access based on data tiers. - -## Part tables - -[Part tables](master-part.md) do not have their own tier. -Instead, they share the same tier as their master table. -The prefix for part tables also differs from the other tiers. -They are prefixed by the name of their master table, separated by two underscores. - -For example, the table for the class `Channel(dj.Part)` with the master -`Ephys(dj.Imported)` will be named `_ephys__channel`. +DataJoint assigns all tables to one of four data tiers that differentiate how +the data originate. The tier determines both the table's behavior and how it +should be treated in terms of backup and data management. + +## Table Tiers Overview + +| Tier | Superclass | Origin | Auto-populated | +|------|------------|--------|----------------| +| Lookup | `dj.Lookup` | Predefined facts and parameters | No | +| Manual | `dj.Manual` | External entry (users, scripts) | No | +| Imported | `dj.Imported` | External data sources + upstream | Yes | +| Computed | `dj.Computed` | Upstream tables only | Yes | + +## Lookup Tables + +Lookup tables store **predefined facts, parameters, and options** that are +independent of any specific experiment or dataset. Their contents are typically +defined in code alongside the table definition. + +```python +@schema +class Species(dj.Lookup): + definition = """ + species : varchar(30) + --- + species_class : enum('mammal', 'bird', 'fish', 'reptile') + typical_lifespan : smallint # years + """ + contents = [ + ('mouse', 'mammal', 3), + ('rat', 'mammal', 3), + ('zebrafish', 'fish', 5), + ('macaque', 'mammal', 30), + ] +``` + +The `contents` attribute automatically populates the table when the schema is +first activated. Use lookup tables for: + +- Species, strains, genotypes +- Experiment parameters and configurations +- Equipment and device catalogs +- Standard protocols and methods + +```python +@schema +class StimProtocol(dj.Lookup): + definition = """ + protocol_name : varchar(50) + --- + duration : float # seconds + frequency : float # Hz + amplitude : float # arbitrary units + description : varchar(255) + """ + contents = [ + ('baseline', 0, 0, 0, 'No stimulation'), + ('low_freq', 10.0, 1.0, 0.5, 'Low frequency stimulation'), + ('high_freq', 10.0, 10.0, 0.5, 'High frequency stimulation'), + ] +``` + +## Manual Tables + +Manual tables store **externally entered data** that originates outside the +DataJoint pipeline. This includes data entered by users through interfaces, +imported from external systems, or ingested from raw data files. + +```python +@schema +class Subject(dj.Manual): + definition = """ + subject_id : int # unique subject identifier + --- + species : varchar(30) + date_of_birth : date + sex : enum('M', 'F', 'U') + subject_notes='' : varchar(4000) + """ +``` + +Manual data is the **most valuable** since it cannot be regenerated from other +tables. Always ensure manual tables are backed up. Common uses: + +- Subject/animal information +- Session metadata +- User-entered annotations +- Raw data file references + +```python +@schema +class Session(dj.Manual): + definition = """ + -> Subject + session_date : date + --- + -> [nullable] User + session_notes='' : varchar(2000) + data_path='' : varchar(255) + """ +``` + +## Imported Tables + +Imported tables are **auto-populated** but require access to **external data +sources** (files, instruments, APIs) in addition to upstream DataJoint tables. +They define a `make()` method that reads external data. + +```python +@schema +class Recording(dj.Imported): + definition = """ + -> Session + recording_id : smallint + --- + duration : float # seconds + sampling_rate : float # Hz + """ + + def make(self, key): + # Read from external data files + data_path = (Session & key).fetch1('data_path') + recording_files = list_recordings(data_path) + + for i, rec_file in enumerate(recording_files): + metadata = read_recording_metadata(rec_file) + self.insert1(dict( + key, + recording_id=i, + duration=metadata['duration'], + sampling_rate=metadata['sampling_rate'] + )) +``` + +Use imported tables when data comes from: + +- Raw data files (electrophysiology, imaging) +- External databases or APIs +- Instrument outputs +- File system scans + +## Computed Tables + +Computed tables are **auto-populated** using **only upstream DataJoint tables**. +No external data sources are accessed. This makes computed data the safest to +regenerate if lost. + +```python +@schema +class FilteredSignal(dj.Computed): + definition = """ + -> Recording + --- + filtered_data : longblob + snr : float # signal-to-noise ratio + """ + + def make(self, key): + # Fetch data from upstream tables only + raw_data = (RawSignal & key).fetch1('signal') + + # Compute results + filtered = bandpass_filter(raw_data, low=1, high=100) + snr = compute_snr(filtered) + + self.insert1(dict(key, filtered_data=filtered, snr=snr)) +``` + +Computed tables are ideal for: + +- Signal processing results +- Statistical analyses +- Machine learning outputs +- Derived metrics and features + +## Auto-Population + +Imported and Computed tables support the `populate()` method: + +```python +# Populate all pending entries +FilteredSignal.populate() + +# Show progress +FilteredSignal.populate(display_progress=True) + +# Restrict to specific keys +FilteredSignal.populate(Recording & 'session_date > "2024-01-01"') + +# Distributed processing with job reservation +FilteredSignal.populate(reserve_jobs=True) +``` + +See [Populate](../../operations/populate.md) for details. + +## Choosing the Right Tier + +| Scenario | Tier | +|----------|------| +| Experiment parameters that rarely change | Lookup | +| Subject information entered by users | Manual | +| Raw data imported from files | Imported | +| Processed results from raw data | Computed | +| Derived metrics from processed data | Computed | +| External database sync | Imported | + +## Data Value and Backup + +| Tier | Data Value | Backup Priority | +|------|------------|-----------------| +| Manual | Highest (irreplaceable) | Critical | +| Imported | High (external source needed) | High | +| Computed | Lower (can regenerate) | Optional | +| Lookup | Low (defined in code) | Low | + +Database administrators use tier information to set appropriate backup policies. +Computed data can often be excluded from backups since it can be regenerated +from source tables. + +## Internal Table Naming + +DataJoint prefixes table names on the server to indicate tier: + +| Tier | Prefix | Example | +|------|--------|---------| +| Manual | (none) | `subject` | +| Lookup | `#` | `#species` | +| Imported | `_` | `_recording` | +| Computed | `__` | `__filtered_signal` | + +Users don't need to know these conventions—DataJoint handles naming automatically. diff --git a/docs/src/manipulation/insert.md b/docs/src/manipulation/insert.md deleted file mode 100644 index c64e55f17..000000000 --- a/docs/src/manipulation/insert.md +++ /dev/null @@ -1,94 +0,0 @@ -# Insert - -The `insert` method of DataJoint table objects inserts entities into the table. - -In Python there is a separate method `insert1` to insert one entity at a time. -The entity may have the form of a Python dictionary with key names matching the -attribute names in the table. - -```python -lab.Person.insert1( - dict(username='alice', - first_name='Alice', - last_name='Cooper')) -``` - -The entity also may take the form of a sequence of values in the same order as the -attributes in the table. - -```python -lab.Person.insert1(['alice', 'Alice', 'Cooper']) -``` - -Additionally, the entity may be inserted as a -[NumPy record array](https://docs.scipy.org/doc/numpy/reference/generated/numpy.record.html#numpy.record) - or [Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html). - -The `insert` method accepts a sequence or a generator of multiple entities and is used -to insert multiple entities at once. - -```python -lab.Person.insert([ - ['alice', 'Alice', 'Cooper'], - ['bob', 'Bob', 'Dylan'], - ['carol', 'Carol', 'Douglas']]) -``` - -Several optional parameters can be used with `insert`: - - `replace` If `True`, replaces the existing entity. - (Default `False`.) - - `skip_duplicates` If `True`, silently skip duplicate inserts. - (Default `False`.) - - `ignore_extra_fields` If `False`, fields that are not in the heading raise an error. - (Default `False`.) - - `allow_direct_insert` If `True`, allows inserts outside of populate calls. - Applies only in auto-populated tables. - (Default `None`.) - -## Batched inserts - -Inserting a set of entities in a single `insert` differs from inserting the same set of -entities one-by-one in a `for` loop in two ways: - -1. Network overhead is reduced. - Network overhead can be tens of milliseconds per query. - Inserting 1000 entities in a single `insert` call may save a few seconds over - inserting them individually. -2. The insert is performed as an all-or-nothing transaction. - If even one insert fails because it violates any constraint, then none of the - entities in the set are inserted. - -However, inserting too many entities in a single query may run against buffer size or -packet size limits of the database server. -Due to these limitations, performing inserts of very large numbers of entities should -be broken up into moderately sized batches, such as a few hundred at a time. - -## Server-side inserts - -Data inserted into a table often come from other tables already present on the database server. -In such cases, data can be [fetched](../query/fetch.md) from the first table and then -inserted into another table, but this results in transfers back and forth between the -database and the local system. -Instead, data can be inserted from one table into another without transfers between the -database and the local system using [queries](../query/principles.md). - -In the example below, a new schema has been created in preparation for phase two of a -project. -Experimental protocols from the first phase of the project will be reused in the second -phase. -Since the entities are already present on the database in the `Protocol` table of the -`phase_one` schema, we can perform a server-side insert into `phase_two.Protocol` -without fetching a local copy. - -```python -# Server-side inserts are faster... -phase_two.Protocol.insert(phase_one.Protocol) - -# ...than fetching before inserting -protocols = phase_one.Protocol.fetch() -phase_two.Protocol.insert(protocols) -``` diff --git a/docs/src/manipulation/delete.md b/docs/src/operations/delete.md similarity index 100% rename from docs/src/manipulation/delete.md rename to docs/src/operations/delete.md diff --git a/docs/src/compute/distributed.md b/docs/src/operations/distributed.md similarity index 100% rename from docs/src/compute/distributed.md rename to docs/src/operations/distributed.md diff --git a/docs/src/manipulation/index.md b/docs/src/operations/index.md similarity index 70% rename from docs/src/manipulation/index.md rename to docs/src/operations/index.md index 295195778..b39e3de14 100644 --- a/docs/src/manipulation/index.md +++ b/docs/src/operations/index.md @@ -5,5 +5,4 @@ without modifying the structure of the stored data. These operations include [insert](insert.md), [delete](delete.md), and [update](update.md). -Data manipulation operations in DataJoint respect the -[integrity](../design/integrity.md) constraints. +Data manipulation operations in DataJoint respect integrity constraints. diff --git a/docs/src/operations/insert.md b/docs/src/operations/insert.md new file mode 100644 index 000000000..a3dd0f1bd --- /dev/null +++ b/docs/src/operations/insert.md @@ -0,0 +1,327 @@ +# Insert + +The `insert` operation adds new entities to tables. It is the primary way data +enters a DataJoint pipeline from external sources. + +## Single Entity: insert1 + +Use `insert1` to insert one entity at a time: + +```python +# Insert as dictionary (recommended) +Subject.insert1({ + 'subject_id': 1, + 'species': 'mouse', + 'date_of_birth': '2023-06-15', + 'sex': 'M' +}) + +# Insert as ordered sequence (matches attribute order) +Subject.insert1([1, 'mouse', '2023-06-15', 'M']) + +# Insert with dict() constructor +Subject.insert1(dict( + subject_id=1, + species='mouse', + date_of_birth='2023-06-15', + sex='M' +)) +``` + +Dictionary format is recommended because it's explicit and doesn't depend on +attribute order. + +## Multiple Entities: insert + +Use `insert` for batch operations with a list of entities: + +```python +# Insert multiple entities +Subject.insert([ + {'subject_id': 1, 'species': 'mouse', 'date_of_birth': '2023-01-15', 'sex': 'M'}, + {'subject_id': 2, 'species': 'mouse', 'date_of_birth': '2023-02-20', 'sex': 'F'}, + {'subject_id': 3, 'species': 'rat', 'date_of_birth': '2023-03-10', 'sex': 'M'}, +]) + +# Insert from generator (memory efficient) +def generate_subjects(): + for i in range(1000): + yield {'subject_id': i, 'species': 'mouse', + 'date_of_birth': '2023-01-01', 'sex': 'U'} + +Subject.insert(generate_subjects()) + +# Insert from pandas DataFrame +import pandas as pd +df = pd.DataFrame({ + 'subject_id': [1, 2, 3], + 'species': ['mouse', 'mouse', 'rat'], + 'date_of_birth': ['2023-01-15', '2023-02-20', '2023-03-10'], + 'sex': ['M', 'F', 'M'] +}) +Subject.insert(df) + +# Insert from numpy record array +import numpy as np +data = np.array([ + (1, 'mouse', '2023-01-15', 'M'), + (2, 'mouse', '2023-02-20', 'F'), +], dtype=[('subject_id', 'i4'), ('species', 'U30'), + ('date_of_birth', 'U10'), ('sex', 'U1')]) +Subject.insert(data) +``` + +## Insert Options + +### skip_duplicates + +Silently skip entities with existing primary keys: + +```python +# Insert new subjects, skip if already exists +Subject.insert(subjects, skip_duplicates=True) +``` + +Use for idempotent scripts that can safely be re-run. + +### ignore_extra_fields + +Ignore dictionary keys that don't match table attributes: + +```python +# External data with extra fields +external_data = { + 'subject_id': 1, + 'species': 'mouse', + 'date_of_birth': '2023-01-15', + 'sex': 'M', + 'extra_field': 'ignored', # not in table + 'another_field': 123 # not in table +} +Subject.insert1(external_data, ignore_extra_fields=True) +``` + +### replace + +Replace existing entities with matching primary keys: + +```python +# Update subject if exists, insert if new +Subject.insert1({ + 'subject_id': 1, + 'species': 'mouse', + 'date_of_birth': '2023-01-15', + 'sex': 'F' # corrected value +}, replace=True) +``` + +**Warning**: Use `replace` carefully. It circumvents DataJoint's data integrity +model. Prefer delete-and-insert for most corrections. + +### allow_direct_insert + +Allow inserts into auto-populated tables outside of `make()`: + +```python +# Normally auto-populated tables only allow inserts in make() +# This overrides that restriction +ComputedTable.insert1(data, allow_direct_insert=True) +``` + +Use sparingly, typically for data migration or recovery. + +## Batch Insert Behavior + +Batched inserts differ from individual inserts: + +1. **Reduced network overhead**: One round-trip instead of many +2. **Atomic transaction**: All-or-nothing (if one fails, none are inserted) + +```python +# Efficient: single transaction +Subject.insert([entity1, entity2, entity3]) # ~10ms total + +# Less efficient: multiple transactions +for entity in [entity1, entity2, entity3]: + Subject.insert1(entity) # ~10ms each = ~30ms total +``` + +For very large batches, break into chunks to avoid buffer limits: + +```python +def chunked_insert(table, entities, chunk_size=500): + """Insert entities in chunks.""" + chunk = [] + for entity in entities: + chunk.append(entity) + if len(chunk) >= chunk_size: + table.insert(chunk, skip_duplicates=True) + chunk = [] + if chunk: + table.insert(chunk, skip_duplicates=True) + +chunked_insert(Subject, large_entity_list) +``` + +## Server-Side Insert + +Insert data from one table to another without local transfer: + +```python +# Server-side: data never leaves the database +TargetTable.insert(SourceTable & 'condition="value"') + +# Equivalent but slower: fetch then insert +data = (SourceTable & 'condition="value"').fetch() +TargetTable.insert(data) +``` + +Server-side inserts are efficient for: +- Copying between schemas +- Populating from query results +- Data migration + +```python +# Copy all protocols from phase 1 to phase 2 +phase2.Protocol.insert(phase1.Protocol) + +# Copy subset with projection +phase2.Summary.insert( + phase1.Experiment.proj('experiment_id', 'start_date') + & 'start_date > "2024-01-01"' +) +``` + +## Referential Integrity + +Inserts must satisfy foreign key constraints: + +```python +# Subject must exist before Session can reference it +Subject.insert1({'subject_id': 1, 'species': 'mouse', ...}) +Session.insert1({'subject_id': 1, 'session_date': '2024-01-15', ...}) + +# This fails - subject_id=999 doesn't exist +Session.insert1({'subject_id': 999, 'session_date': '2024-01-15'}) +# IntegrityError: foreign key constraint fails +``` + +## Object Attributes + +Tables with [`object`](../datatypes/object.md) type attributes accept various input formats: + +```python +@schema +class Recording(dj.Manual): + definition = """ + recording_id : int + --- + raw_data : + """ + +# Insert from local file +Recording.insert1({ + 'recording_id': 1, + 'raw_data': '/local/path/to/data.dat' +}) + +# Insert from local folder +Recording.insert1({ + 'recording_id': 2, + 'raw_data': '/local/path/to/data_folder/' +}) + +# Insert from remote URL (S3, GCS, Azure, HTTP) +Recording.insert1({ + 'recording_id': 3, + 'raw_data': 's3://bucket/path/to/data.dat' +}) + +# Insert from stream with extension +with open('/path/to/data.bin', 'rb') as f: + Recording.insert1({ + 'recording_id': 4, + 'raw_data': ('.bin', f) + }) +``` + +### Staged Inserts + +For large objects (Zarr arrays, HDF5), write directly to storage: + +```python +import zarr + +with Recording.staged_insert1 as staged: + # Set key values + staged.rec['recording_id'] = 5 + + # Create Zarr array directly in object storage + z = zarr.open(staged.store('raw_data', '.zarr'), mode='w', + shape=(10000, 10000), dtype='f4') + z[:] = compute_large_array() + + # Assign to record + staged.rec['raw_data'] = z + +# On success: metadata computed, record inserted +# On exception: storage cleaned up, nothing inserted +``` + +## Common Patterns + +### Ingestion Script + +```python +def ingest_subjects(csv_file): + """Ingest subjects from CSV file.""" + import pandas as pd + df = pd.read_csv(csv_file) + + # Validate and transform + df['date_of_birth'] = pd.to_datetime(df['date_of_birth']).dt.date + df['sex'] = df['sex'].str.upper() + + # Insert with conflict handling + Subject.insert(df.to_dict('records'), + skip_duplicates=True, + ignore_extra_fields=True) +``` + +### Conditional Insert + +```python +def insert_if_missing(table, entity): + """Insert entity only if not already present.""" + key = {k: entity[k] for k in table.primary_key} + if not (table & key): + table.insert1(entity) +``` + +### Insert with Default Values + +```python +# Table with defaults +@schema +class Experiment(dj.Manual): + definition = """ + experiment_id : int + --- + notes='' : varchar(2000) + status='pending' : enum('pending', 'running', 'complete') + created=CURRENT_TIMESTAMP : timestamp + """ + +# Defaults are applied automatically +Experiment.insert1({'experiment_id': 1}) +# Result: notes='', status='pending', created= +``` + +## Best Practices + +1. **Use dictionaries**: Explicit attribute names prevent ordering errors +2. **Batch when possible**: Reduce network overhead with multi-entity inserts +3. **Use skip_duplicates for idempotency**: Safe to re-run scripts +4. **Validate before insert**: Check data quality before committing +5. **Handle errors gracefully**: Wrap inserts in try/except for production code +6. **Use server-side inserts**: When copying between tables diff --git a/docs/src/operations/jobs.md b/docs/src/operations/jobs.md new file mode 100644 index 000000000..44a019765 --- /dev/null +++ b/docs/src/operations/jobs.md @@ -0,0 +1,218 @@ +# Job Management + +DataJoint provides a job reservation system for coordinating distributed `populate()` +operations across multiple workers. Each auto-populated table (`dj.Imported` or +`dj.Computed`) has an associated hidden jobs table that tracks processing status. + +## Overview + +The jobs system enables: + +- **Distributed computing**: Multiple workers can process the same table without conflicts +- **Progress tracking**: Monitor pending, reserved, completed, and failed jobs +- **Error management**: Track and retry failed computations +- **Priority scheduling**: Process urgent jobs first + +## Accessing the Jobs Table + +Every auto-populated table has a `.jobs` attribute: + +```python +@schema +class ProcessedData(dj.Computed): + definition = """ + -> RawData + --- + result : float + """ + + def make(self, key): + # computation logic + self.insert1(dict(key, result=compute(key))) + +# Access the jobs table +ProcessedData.jobs +``` + +## Job States + +Jobs can be in one of five states: + +| Status | Description | +|--------|-------------| +| `pending` | Queued and ready for processing | +| `reserved` | Currently being processed by a worker | +| `success` | Completed successfully | +| `error` | Failed with an error | +| `ignore` | Manually marked to skip | + +## Refreshing the Job Queue + +The `refresh()` method updates the jobs queue by adding new jobs and removing stale ones: + +```python +# Add jobs for all missing keys +ProcessedData.jobs.refresh() + +# Add jobs for specific restrictions +ProcessedData.jobs.refresh("subject_id > 10") + +# Set priority (lower = more urgent, default: 5) +ProcessedData.jobs.refresh(priority=1) + +# Delay job availability by 60 seconds +ProcessedData.jobs.refresh(delay=60) +``` + +**Returns**: `{'added': int, 'removed': int}` - counts of jobs added and stale jobs removed. + +### Parameters + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `restrictions` | None | Filter conditions for key_source | +| `delay` | 0 | Seconds until jobs become available | +| `priority` | 5 | Job priority (lower = more urgent) | +| `stale_timeout` | 3600 | Seconds before checking pending jobs for staleness | + +## Querying Job Status + +### Filter by Status + +```python +# Pending jobs +ProcessedData.jobs.pending + +# Reserved (in-progress) jobs +ProcessedData.jobs.reserved + +# Completed jobs +ProcessedData.jobs.completed + +# Failed jobs +ProcessedData.jobs.errors + +# Ignored jobs +ProcessedData.jobs.ignored +``` + +### Progress Summary + +```python +ProcessedData.jobs.progress() +# Returns: {'pending': 50, 'reserved': 2, 'success': 100, 'error': 3, 'ignore': 1, 'total': 156} +``` + +### Fetch Pending Jobs + +```python +# Get up to 10 highest-priority pending jobs +keys = ProcessedData.jobs.fetch_pending(limit=10) + +# Get pending jobs at priority 3 or higher (lower number) +keys = ProcessedData.jobs.fetch_pending(priority=3) +``` + +## Managing Jobs + +### Mark Keys to Ignore + +Skip specific keys during populate: + +```python +ProcessedData.jobs.ignore({"subject_id": 5, "session_id": 3}) +``` + +### Clear Jobs + +```python +# Delete all jobs +ProcessedData.jobs.delete() + +# Delete specific jobs +(ProcessedData.jobs & "status='error'").delete() + +# Drop the entire jobs table +ProcessedData.jobs.drop() +``` + +### View Error Details + +```python +# View error messages +ProcessedData.jobs.errors.fetch("KEY", "error_message") + +# Get full error traceback +error_job = (ProcessedData.jobs.errors & key).fetch1() +print(error_job["error_stack"]) +``` + +## Configuration + +Configure job behavior in `datajoint.json`: + +```json +{ + "jobs": { + "default_priority": 5, + "stale_timeout": 3600, + "keep_completed": false + } +} +``` + +| Setting | Default | Description | +|---------|---------|-------------| +| `jobs.default_priority` | 5 | Default priority for new jobs | +| `jobs.stale_timeout` | 3600 | Seconds before pending jobs are checked for staleness | +| `jobs.keep_completed` | false | Keep job records after successful completion | + +## Jobs Table Schema + +The jobs table stores: + +| Attribute | Type | Description | +|-----------|------|-------------| +| *primary key* | (varies) | FK-derived primary key from target table | +| `status` | enum | pending, reserved, success, error, ignore | +| `priority` | int | Lower = more urgent | +| `created_time` | datetime | When job was added | +| `scheduled_time` | datetime | Process on or after this time | +| `reserved_time` | datetime | When job was reserved | +| `completed_time` | datetime | When job completed | +| `duration` | float | Execution duration in seconds | +| `error_message` | varchar | Error message if failed | +| `error_stack` | blob | Full error traceback | +| `user` | varchar | Database user | +| `host` | varchar | Worker hostname | +| `pid` | int | Worker process ID | +| `connection_id` | bigint | MySQL connection ID | + +## Distributed Processing Example + +Run multiple workers to process a table in parallel: + +```python +# Worker script (run on multiple machines) +import datajoint as dj + +schema = dj.Schema('my_pipeline') + +@schema +class Analysis(dj.Computed): + definition = """ + -> Experiment + --- + result : float + """ + + def make(self, key): + data = (Experiment & key).fetch1('data') + self.insert1(dict(key, result=analyze(data))) + +# Each worker runs: +Analysis.populate(reserve_jobs=True) +``` + +With `reserve_jobs=True`, workers coordinate through the jobs table to avoid +processing the same key twice. diff --git a/docs/src/compute/key-source.md b/docs/src/operations/key-source.md similarity index 98% rename from docs/src/compute/key-source.md rename to docs/src/operations/key-source.md index 76796ec0c..c9b5d2ce7 100644 --- a/docs/src/compute/key-source.md +++ b/docs/src/operations/key-source.md @@ -45,7 +45,7 @@ definition = """ -> Recording --- sample_rate : float -eeg_data : longblob +eeg_data : """ key_source = Recording & 'recording_type = "EEG"' ``` diff --git a/docs/src/compute/make.md b/docs/src/operations/make.md similarity index 96% rename from docs/src/compute/make.md rename to docs/src/operations/make.md index 1b5569b65..138cee1b6 100644 --- a/docs/src/compute/make.md +++ b/docs/src/operations/make.md @@ -1,6 +1,6 @@ # Transactions in Make -Each call of the [make](../compute/make.md) method is enclosed in a transaction. +Each call of the `make` method is enclosed in a transaction. DataJoint users do not need to explicitly manage transactions but must be aware of their use. @@ -16,8 +16,7 @@ become visible to other processes until the `make` call completes execution. If the `make` method raises an exception, all changes made so far will be discarded and will never become visible to other processes. -Transactions are particularly important in maintaining -[group integrity](../design/integrity.md#group-integrity) with +Transactions are particularly important in maintaining group integrity with [master-part relationships](../design/tables/master-part.md). The `make` call of a master table first inserts the master entity and then inserts all the matching part entities in the part tables. @@ -152,7 +151,7 @@ class ImageAnalysis(dj.Computed): # Complex image analysis results -> Image --- - analysis_result : longblob + analysis_result : processing_time : float """ @@ -188,7 +187,7 @@ class ImageAnalysis(dj.Computed): # Complex image analysis results -> Image --- - analysis_result : longblob + analysis_result : processing_time : float """ diff --git a/docs/src/operations/populate.md b/docs/src/operations/populate.md new file mode 100644 index 000000000..c9ab5bd8b --- /dev/null +++ b/docs/src/operations/populate.md @@ -0,0 +1,378 @@ +# Auto-populate + +Auto-populated tables (`dj.Imported` and `dj.Computed`) automatically compute and +insert their data based on upstream tables. They define a `make()` method that +specifies how to compute each entry. + +## Defining Auto-populated Tables + +### Basic Structure + +```python +@schema +class Analysis(dj.Computed): + definition = """ + -> Recording + --- + mean_value : float + std_value : float + """ + + def make(self, key): + # 1. Fetch data from upstream tables + data = (Recording & key).fetch1('data') + + # 2. Compute results + result = dict( + key, + mean_value=data.mean(), + std_value=data.std() + ) + + # 3. Insert into self + self.insert1(result) +``` + +### Imported vs Computed + +```python +# Use Imported when accessing external files +@schema +class RawData(dj.Imported): + definition = """ + -> Session + --- + data : longblob + """ + + def make(self, key): + # Access external file system + filepath = (Session & key).fetch1('data_path') + data = load_from_file(filepath) + self.insert1(dict(key, data=data)) + +# Use Computed when only using upstream tables +@schema +class ProcessedData(dj.Computed): + definition = """ + -> RawData + --- + processed : longblob + """ + + def make(self, key): + # Only access DataJoint tables + raw = (RawData & key).fetch1('data') + self.insert1(dict(key, processed=process(raw))) +``` + +## The make() Method + +The `make(self, key)` method receives a primary key dictionary and must: + +1. **Fetch** data from upstream tables using `key` for restriction +2. **Compute** the results +3. **Insert** into `self` + +```python +def make(self, key): + # key contains primary key values, e.g., {'subject_id': 1, 'session_date': '2024-01-15'} + + # Fetch upstream data + raw_data = (RawData & key).fetch1('data') + params = (ProcessingParams & key).fetch1() + + # Compute + result = analyze(raw_data, **params) + + # Insert - add computed values to key + self.insert1(dict(key, result=result)) +``` + +### Multiple Inserts per make() + +When a table adds dimensions to the primary key: + +```python +@schema +class TrialAnalysis(dj.Computed): + definition = """ + -> Session + trial_num : int + --- + metric : float + """ + + def make(self, key): + # key only has session info, we generate trial_num + trials = (Trial & key).fetch(as_dict=True) + + for trial in trials: + metric = compute_metric(trial) + self.insert1(dict(key, trial_num=trial['trial_num'], metric=metric)) +``` + +### Master-Part Pattern + +For tables with part tables: + +```python +@schema +class Segmentation(dj.Computed): + definition = """ + -> Image + --- + num_cells : int + """ + + class Cell(dj.Part): + definition = """ + -> master + cell_id : int + --- + center_x : float + center_y : float + area : float + """ + + def make(self, key): + image = (Image & key).fetch1('pixels') + cells = segment_image(image) + + # Insert master + self.insert1(dict(key, num_cells=len(cells))) + + # Insert parts + self.Cell.insert([ + dict(key, cell_id=i, **cell) + for i, cell in enumerate(cells) + ]) +``` + +## Running populate() + +### Basic Usage + +```python +# Populate all missing entries +Analysis.populate() + +# Show progress bar +Analysis.populate(display_progress=True) + +# Restrict to specific keys +Analysis.populate(Recording & 'session_date > "2024-01-01"') +``` + +### Populate Options + +| Option | Default | Description | +|--------|---------|-------------| +| `restrictions` | None | Restrict which keys to populate | +| `display_progress` | False | Show progress bar | +| `limit` | None | Maximum keys to check | +| `max_calls` | None | Maximum make() calls | +| `order` | 'original' | Order: 'original', 'reverse', 'random' | +| `suppress_errors` | False | Continue on errors | +| `reserve_jobs` | False | Enable distributed job reservation | + +```python +# Populate with options +Analysis.populate( + restrictions='subject_id < 100', + display_progress=True, + max_calls=50, + order='random', + suppress_errors=True, + reserve_jobs=True +) +``` + +### Check Progress + +```python +# Print progress summary +Analysis.progress() +# Output: Analysis: 150/200 (75.0%) + +# Get counts without printing +done, total = Analysis.progress(display=False) + +# Progress for restricted subset +Analysis.progress('subject_id < 10') +``` + +## Distributed Processing + +For parallel processing across multiple workers, use job reservation: + +```python +# Worker 1 +Analysis.populate(reserve_jobs=True) + +# Worker 2 (different machine/process) +Analysis.populate(reserve_jobs=True) +``` + +Each worker reserves keys before processing, preventing duplicates. +See [Jobs](jobs.md) for detailed job management. + +## Error Handling + +### Suppress and Log Errors + +```python +# Continue processing despite errors +errors = Analysis.populate( + suppress_errors=True, + reserve_jobs=True +) + +# errors contains list of error messages +for error in errors: + print(error) + +# Get exception objects instead +exceptions = Analysis.populate( + suppress_errors=True, + return_exception_objects=True +) +``` + +### View Failed Jobs + +```python +# Access jobs table +schema.jobs + +# View errors +(schema.jobs & 'status="error"').fetch() + +# Retry failed jobs +(schema.jobs & 'status="error"').delete() +Analysis.populate(reserve_jobs=True) +``` + +## Three-Part Make Pattern + +For long-running computations, split `make()` into three phases to minimize +database lock time: + +```python +@schema +class LongAnalysis(dj.Computed): + definition = """ + -> Recording + --- + result : longblob + duration : float + """ + + def make_fetch(self, key): + """Phase 1: Fetch data (short transaction)""" + data = (Recording & key).fetch1('data') + return (data,) # Must return tuple/list + + def make_compute(self, key, data): + """Phase 2: Compute (no transaction - can take hours)""" + import time + start = time.time() + result = expensive_analysis(data) + duration = time.time() - start + return (result, duration) # Must return tuple/list + + def make_insert(self, key, result, duration): + """Phase 3: Insert (short transaction)""" + self.insert1(dict(key, result=result, duration=duration)) +``` + +### How It Works + +1. `make_fetch()` runs in a short transaction to get data +2. `make_compute()` runs outside any transaction (can take hours) +3. Before `make_insert()`, data is re-fetched and verified unchanged +4. `make_insert()` runs in a short transaction + +This prevents long-held database locks during expensive computations. + +### Generator Pattern (Alternative) + +```python +def make(self, key): + # Fetch + data = (Recording & key).fetch1('data') + computed = yield (data,) # Yield fetched data + + if computed is None: + # Compute (outside transaction) + result = expensive_analysis(data) + computed = yield (result,) + + # Insert + self.insert1(dict(key, result=computed[0])) + yield # Signal completion +``` + +## Common Patterns + +### Conditional Computation + +```python +def make(self, key): + params = (Params & key).fetch1() + + if params['method'] == 'fast': + result = fast_analysis(key) + else: + result = thorough_analysis(key) + + self.insert1(dict(key, result=result)) +``` + +### Skip Invalid Keys + +```python +def make(self, key): + data = (Recording & key).fetch1('data') + + if not is_valid(data): + # Insert placeholder or skip + self.insert1(dict(key, result=None, valid=False)) + return + + result = analyze(data) + self.insert1(dict(key, result=result, valid=True)) +``` + +### External Tool Integration + +```python +def make(self, key): + import subprocess + + # Export data + data = (Recording & key).fetch1('data') + input_file = f'/tmp/input_{key["recording_id"]}.dat' + save_data(data, input_file) + + # Run external tool + output_file = f'/tmp/output_{key["recording_id"]}.dat' + subprocess.run(['analyze', input_file, '-o', output_file]) + + # Import results + result = load_data(output_file) + self.insert1(dict(key, result=result)) + + # Cleanup + os.remove(input_file) + os.remove(output_file) +``` + +## Best Practices + +1. **Keep make() idempotent**: Same input should produce same output +2. **Use transactions wisely**: Long computations outside transactions +3. **Handle errors gracefully**: Use `suppress_errors` for batch processing +4. **Monitor progress**: Use `display_progress=True` for long jobs +5. **Distribute work**: Use `reserve_jobs=True` for parallel processing +6. **Clean up resources**: Remove temporary files after processing diff --git a/docs/src/manipulation/transactions.md b/docs/src/operations/transactions.md similarity index 100% rename from docs/src/manipulation/transactions.md rename to docs/src/operations/transactions.md diff --git a/docs/src/manipulation/update.md b/docs/src/operations/update.md similarity index 87% rename from docs/src/manipulation/update.md rename to docs/src/operations/update.md index 7faa7cb87..86bfce1f2 100644 --- a/docs/src/manipulation/update.md +++ b/docs/src/operations/update.md @@ -3,18 +3,16 @@ In database programming, the **update** operation refers to modifying the values of individual attributes in an entity within a table without replacing the entire entity. Such an in-place update mechanism is not part of DataJoint's data manipulation model, -because it circumvents data -[dependency constraints](../design/integrity.md#referential-integrity). +because it circumvents data dependency constraints. This is not to say that data cannot be changed once they are part of a pipeline. In DataJoint, data is changed by replacing entire entities rather than by updating the values of their attributes. The process of deleting existing entities and inserting new entities with corrected -values ensures the [integrity](../design/integrity.md) of the data throughout the -pipeline. +values ensures the integrity of the data throughout the pipeline. This approach applies specifically to automated tables -(see [Auto-populated tables](../compute/populate.md)). +(see [Auto-populated tables](populate.md)). However, manual tables are often edited outside DataJoint through other interfaces. It is up to the user's discretion to allow updates in manual tables, and the user must be cognizant of the fact that updates will not trigger re-computation of dependent data. diff --git a/docs/src/query/fetch.md b/docs/src/query/fetch.md index 105d70084..e9197550d 100644 --- a/docs/src/query/fetch.md +++ b/docs/src/query/fetch.md @@ -1,126 +1,325 @@ # Fetch -Data queries in DataJoint comprise two distinct steps: +The `fetch` operation retrieves data from query results into Python. It's the +second step after constructing a query with [operators](operators.md). -1. Construct the `query` object to represent the required data using tables and -[operators](operators.md). -2. Fetch the data from `query` into the workspace of the host language -- described in -this section. +## Basic Fetch -Note that entities returned by `fetch` methods are not guaranteed to be sorted in any -particular order unless specifically requested. -Furthermore, the order is not guaranteed to be the same in any two queries, and the -contents of two identical queries may change between two sequential invocations unless -they are wrapped in a transaction. -Therefore, if you wish to fetch matching pairs of attributes, do so in one `fetch` call. +### Fetch All Entities -The examples below are based on the [example schema](example-schema.md) for this part -of the documentation. +```python +# As NumPy recarray (default) +data = Subject.fetch() + +# As list of dictionaries +data = Subject.fetch(as_dict=True) + +# As pandas DataFrame +data = Subject.fetch(format='frame') +``` + +### Fetch Single Entity + +Use `fetch1` when the query returns exactly one entity: + +```python +# Fetch entire entity +subject = (Subject & 'subject_id=1').fetch1() +# Returns: {'subject_id': 1, 'species': 'mouse', ...} + +# Raises error if zero or multiple entities match +``` + +### Fetch Specific Attributes + +```python +# Single attribute returns 1D array +names = Subject.fetch('species') +# Returns: array(['mouse', 'mouse', 'rat', ...]) + +# Multiple attributes return tuple of arrays +ids, species = Subject.fetch('subject_id', 'species') + +# With fetch1, returns scalar values +subject_id, species = (Subject & 'subject_id=1').fetch1('subject_id', 'species') +# Returns: (1, 'mouse') +``` + +### Fetch Primary Keys + +```python +# List of key dictionaries +keys = Subject.fetch('KEY') +# Returns: [{'subject_id': 1}, {'subject_id': 2}, ...] + +# Single key +key = (Subject & 'subject_id=1').fetch1('KEY') +# Returns: {'subject_id': 1} +``` + +## Output Formats + +### NumPy Recarray (Default) + +```python +data = Subject.fetch() +# Access attributes by name +data['subject_id'] +data['species'] + +# Iterate over entities +for entity in data: + print(entity['subject_id'], entity['species']) +``` + +### List of Dictionaries + +```python +data = Subject.fetch(as_dict=True) +# [{'subject_id': 1, 'species': 'mouse', ...}, ...] + +for entity in data: + print(entity['subject_id']) +``` + +### Pandas DataFrame + +```python +df = Subject.fetch(format='frame') +# DataFrame indexed by primary key + +# Query on the DataFrame +df[df['species'] == 'mouse'] +df.groupby('sex').count() +``` + +## Sorting and Limiting + +### Order By + +```python +# Ascending (default) +data = Subject.fetch(order_by='date_of_birth') -## Entire table +# Descending +data = Subject.fetch(order_by='date_of_birth desc') -The following statement retrieves the entire table as a NumPy -[recarray](https://docs.scipy.org/doc/numpy/reference/generated/numpy.recarray.html). +# Multiple attributes +data = Subject.fetch(order_by=('species', 'date_of_birth desc')) + +# By primary key +data = Subject.fetch(order_by='KEY') + +# SQL reserved words require backticks +data = Table.fetch(order_by='`select` desc') +``` + +### Limit and Offset ```python -data = query.fetch() +# First 10 entities +data = Subject.fetch(limit=10) + +# Entities 11-20 (skip first 10) +data = Subject.fetch(limit=10, offset=10) + +# Most recent 5 subjects +data = Subject.fetch(order_by='date_of_birth desc', limit=5) ``` -To retrieve the data as a list of `dict`: +**Note**: `offset` requires `limit` to be specified. + +## Practical Examples + +### Query and Filter ```python -data = query.fetch(as_dict=True) +# Fetch subjects of a specific species +mice = (Subject & 'species="mouse"').fetch() + +# Fetch with complex restriction +recent_mice = (Subject & 'species="mouse"' + & 'date_of_birth > "2023-01-01"').fetch(as_dict=True) ``` -In some cases, the amount of data returned by fetch can be quite large; in these cases -it can be useful to use the `size_on_disk` attribute to determine if running a bare -fetch would be wise. -Please note that it is only currently possible to query the size of entire tables -stored directly in the database at this time. +### Fetch with Projection + +```python +# Fetch only specific attributes +data = Subject.proj('species', 'sex').fetch() + +# Rename attributes +data = Subject.proj(animal_species='species').fetch() +``` -## As separate variables +### Fetch from Joins ```python -name, img = query.fetch1('name', 'image') # when query has exactly one entity -name, img = query.fetch('name', 'image') # [name, ...] [image, ...] +# Fetch combined data from multiple tables +data = (Session * Subject).fetch() + +# Select attributes from join +ids, dates, species = (Session * Subject).fetch( + 'session_id', 'session_date', 'species' +) ``` -## Primary key values +### Aggregation Results ```python -keydict = tab.fetch1("KEY") # single key dict when tab has exactly one entity -keylist = tab.fetch("KEY") # list of key dictionaries [{}, ...] +# Count sessions per subject +session_counts = (Subject.aggr(Session, count='count(*)')).fetch() + +# Average duration per subject +avg_durations = (Subject.aggr(Trial, avg_dur='avg(duration)')).fetch() ``` -`KEY` can also used when returning attribute values as separate variables, such that -one of the returned variables contains the entire primary keys. +## Working with Blobs -## Sorting and limiting the results +Blob attributes contain serialized Python objects: -To sort the result, use the `order_by` keyword argument. +```python +@schema +class Image(dj.Manual): + definition = """ + image_id : int + --- + pixels : longblob # numpy array + metadata : longblob # dict + """ + +# Fetch returns deserialized objects +image = (Image & 'image_id=1').fetch1() +pixels = image['pixels'] # numpy array +metadata = image['metadata'] # dict + +# Fetch specific blob attribute +pixels = (Image & 'image_id=1').fetch1('pixels') +``` + +## Object Attributes + +[Object](../datatypes/object.md) attributes return `ObjectRef` handles for +efficient access to large files: ```python -# ascending order: -data = query.fetch(order_by='name') -# descending order: -data = query.fetch(order_by='name desc') -# by name first, year second: -data = query.fetch(order_by=('name desc', 'year')) -# sort by the primary key: -data = query.fetch(order_by='KEY') -# sort by name but for same names order by primary key: -data = query.fetch(order_by=('name', 'KEY desc')) +record = Recording.fetch1() +obj = record['raw_data'] + +# Metadata (no I/O) +print(obj.path) # Storage path +print(obj.size) # Size in bytes +print(obj.checksum) # Content hash +print(obj.is_dir) # True if folder + +# Read content +content = obj.read() # Returns bytes + +# Open as file +with obj.open() as f: + data = f.read() + +# Download locally +local_path = obj.download('/local/destination/') ``` -The `order_by` argument can be a string specifying the attribute to sort by. By default -the sort is in ascending order. Use `'attr desc'` to sort in descending order by -attribute `attr`. The value can also be a sequence of strings, in which case, the sort -performed on all the attributes jointly in the order specified. +### Zarr and Xarray Integration + +```python +import zarr +import xarray as xr + +obj = Recording.fetch1()['neural_data'] + +# Open as Zarr +z = zarr.open(obj.store, mode='r') +data = z[:] -The special attribute name `'KEY'` represents the primary key attributes in order that -they appear in the index. Otherwise, this name can be used as any other argument. +# Open with xarray +ds = xr.open_zarr(obj.store) +``` -If an attribute happens to be a SQL reserved word, it needs to be enclosed in -backquotes. For example: +## Performance Considerations + +### Check Size Before Fetching ```python -data = query.fetch(order_by='`select` desc') +# Check table size before fetch +print(f"Table size: {Subject.size_on_disk / 1e6:.2f} MB") +print(f"Entity count: {len(Subject)}") ``` -The `order_by` value is eventually passed to the `ORDER BY` -[clause](https://dev.mysql.com/doc/refman/5.7/en/order-by-optimization.html). +### Stream Large Results -Similarly, the `limit` and `offset` arguments can be used to limit the result to a -subset of entities. +```python +# Process entities one at a time (memory efficient) +for entity in Subject.fetch(as_dict=True): + process(entity) + +# Or with a cursor +for key in Subject.fetch('KEY'): + entity = (Subject & key).fetch1() + process(entity) +``` -For example, one could do the following: +### Fetch Only What You Need ```python -data = query.fetch(order_by='name', limit=10, offset=5) +# Bad: fetch everything, use only ID +all_data = Subject.fetch() +ids = all_data['subject_id'] + +# Good: fetch only needed attribute +ids = Subject.fetch('subject_id') ``` -Note that an `offset` cannot be used without specifying a `limit` as well. +## Common Patterns + +### Conditional Fetch -## Usage with Pandas +```python +def get_subject(subject_id): + """Fetch subject if exists, else None.""" + query = Subject & {'subject_id': subject_id} + if query: + return query.fetch1() + return None +``` -The [pandas library](http://pandas.pydata.org/) is a popular library for data analysis -in Python which can easily be used with DataJoint query results. -Since the records returned by `fetch()` are contained within a `numpy.recarray`, they -can be easily converted to `pandas.DataFrame` objects by passing them into the -`pandas.DataFrame` constructor. -For example: +### Fetch with Defaults ```python -import pandas as pd -frame = pd.DataFrame(tab.fetch()) +def fetch_with_default(query, attribute, default=None): + """Fetch attribute with default value.""" + try: + return query.fetch1(attribute) + except DataJointError: + return default ``` -Calling `fetch()` with the argument `format="frame"` returns results as -`pandas.DataFrame` objects indexed by the table's primary key attributes. +### Batch Processing ```python -frame = tab.fetch(format="frame") +def process_in_batches(table, batch_size=100): + """Process table in batches.""" + keys = table.fetch('KEY') + for i in range(0, len(keys), batch_size): + batch_keys = keys[i:i + batch_size] + batch_data = (table & batch_keys).fetch(as_dict=True) + yield batch_data ``` -Returning results as a `DataFrame` is not possible when fetching a particular subset of -attributes or when `as_dict` is set to `True`. +## Entity Ordering Note + +Fetch results are **not guaranteed to be in any particular order** unless +`order_by` is specified. The order may vary between queries. If you need +matching pairs of attributes, fetch them in a single call: + +```python +# Correct: attributes are matched +ids, names = Subject.fetch('subject_id', 'species') + +# Risky: separate fetches may return different orders +ids = Subject.fetch('subject_id') +names = Subject.fetch('species') # May not match ids! +``` diff --git a/docs/src/query/operators.md b/docs/src/query/operators.md index ee3549f35..0f8f5c1ae 100644 --- a/docs/src/query/operators.md +++ b/docs/src/query/operators.md @@ -33,7 +33,7 @@ languages to simplify and enhance the construction and interpretation of precise efficient data queries. 1. **Entity integrity**: Data are represented and manipulated in the form of tables -representing [well-formed entity sets](../design/integrity.md). +representing well-formed entity sets. This applies to the inputs and outputs of query operators. The output of a query operator is an entity set with a well-defined entity type, a primary key, unique attribute names, etc. @@ -155,8 +155,8 @@ and others. The result of the union operator `A + B` contains all the entities from both operands. -[Entity normalization](../design/normalization) requires that `A` and `B` are of the same type, -with with the same [primary key](../concepts/glossary#primary-key), using homologous +Entity normalization requires that `A` and `B` are of the same type, +with the same primary key, using homologous attributes. Without secondary attributes, the result is the simple set union. With secondary attributes, they must have the same names and datatypes. The two operands must also be **disjoint**, without any duplicate primary key values across both inputs. diff --git a/docs/src/query/principles.md b/docs/src/query/principles.md index 9b9fd284d..9caaf9427 100644 --- a/docs/src/query/principles.md +++ b/docs/src/query/principles.md @@ -72,7 +72,7 @@ n = len(Session & 'session_date >= "2018-01-01"') ## Normalization in queries -Query objects adhere to entity [entity normalization](../design/normalization.md) just +Query objects adhere to entity normalization just like the stored tables do. The result of a query is a well-defined entity set with an readily identifiable entity class and designated primary attributes that jointly distinguish any two entities from diff --git a/docs/src/query/restrict.md b/docs/src/query/restrict.md index f8b61e641..8a561b8d4 100644 --- a/docs/src/query/restrict.md +++ b/docs/src/query/restrict.md @@ -1,205 +1,338 @@ # Restriction -## Restriction operators `&` and `-` +Restriction selects entities from a table that satisfy specific conditions. +It's the most frequently used query operator in DataJoint. -The restriction operator `A & cond` selects the subset of entities from `A` that meet -the condition `cond`. -The exclusion operator `A - cond` selects the complement of restriction, i.e. the -subset of entities from `A` that do not meet the condition `cond`. +## Basic Syntax -Restriction and exclusion. +```python +# Restriction (inclusion): select matching entities +result = Table & condition + +# Exclusion: select non-matching entities +result = Table - condition +``` ![Restriction and exclusion](../images/op-restrict.png){: style="width:400px; align:center"} -The condition `cond` may be one of the following: +## Condition Types + +### Dictionary Conditions + +Dictionaries specify exact equality matches: + +```python +# Single attribute match +Session & {'subject_id': 1} + +# Multiple attribute match (AND) +Session & {'subject_id': 1, 'session_date': '2024-01-15'} + +# Primary key lookup (returns at most one entity) +subject = (Subject & {'subject_id': 1}).fetch1() +``` + +**Note**: Unmatched dictionary keys are silently ignored: + +```python +# Typo in key name - returns ALL entities (no filter applied) +Session & {'sesion_date': '2024-01-15'} # 's' missing +``` + +### String Conditions + +Strings allow SQL-like expressions: + +```python +# Equality +Session & 'user = "Alice"' + +# Inequality +Experiment & 'duration >= 60' + +# Range +Subject & 'date_of_birth BETWEEN "2023-01-01" AND "2023-12-31"' + +# Pattern matching +Subject & 'species LIKE "mouse%"' + +# NULL checks +Session & 'notes IS NOT NULL' -+ another table -+ a mapping, e.g. `dict` -+ an expression in a character string -+ a collection of conditions as a `list`, `tuple`, or Pandas `DataFrame` -+ a Boolean expression (`True` or `False`) -+ an `AndList` -+ a `Not` object -+ a query expression +# Arithmetic +Trial & 'end_time - start_time > 10' +``` + +### Table Conditions (Semijoins) -As the restriction and exclusion operators are complementary, queries can be -constructed using both operators that will return the same results. -For example, the queries `A & cond` and `A - Not(cond)` will return the same entities. +Restrict by related entities in another table: -## Restriction by a table +```python +# Sessions that have at least one trial +Session & Trial -When restricting table `A` with another table, written `A & B`, the two tables must be -**join-compatible** (see `join-compatible` in [Operators](./operators.md)). -The result will contain all entities from `A` for which there exist a matching entity -in `B`. -Exclusion of table `A` with table `B`, or `A - B`, will contain all entities from `A` -for which there are no matching entities in `B`. +# Sessions with no trials +Session - Trial -Restriction by another table. +# Subjects that have sessions +Subject & Session + +# Subjects with no sessions +Subject - Session +``` ![Restriction by another table](../images/restrict-example1.png){: style="width:546px; align:center"} -Exclusion by another table. +### Query Conditions -![Exclusion by another table](../images/diff-example1.png){: style="width:539px; align:center"} +Use query expressions as conditions: -### Restriction by a table with no common attributes +```python +# Sessions by Alice +alice_sessions = Session & 'user = "Alice"' -Restriction of table `A` with another table `B` having none of the same attributes as -`A` will simply return all entities in `A`, unless `B` is empty as described below. -Exclusion of table `A` with `B` having no common attributes will return no entities, -unless `B` is empty as described below. +# Experiments in Alice's sessions +Experiment & alice_sessions -Restriction by a table having no common attributes. +# Trials from experiments longer than 60 seconds +long_experiments = Experiment & 'duration >= 60' +Trial & long_experiments +``` -![Restriction by a table with no common attributes](../images/restrict-example2.png){: style="width:571px; align:center"} +## Combining Conditions -Exclusion by a table having no common attributes. +### AND Logic (Chain Restrictions) -![Exclusion by a table having no common attributes](../images/diff-example2.png){: style="width:571px; align:center"} +```python +# Multiple conditions combined with AND +Session & 'user = "Alice"' & 'session_date > "2024-01-01"' + +# Equivalent using AndList +Session & dj.AndList([ + 'user = "Alice"', + 'session_date > "2024-01-01"' +]) +``` + +### OR Logic (List/Tuple) + +```python +# Entities matching ANY condition (OR) +Subject & ['subject_id = 1', 'subject_id = 2', 'subject_id = 3'] + +# Multiple users +Session & ['user = "Alice"', 'user = "Bob"'] -### Restriction by an empty table +# Equivalent using tuple +Session & ('user = "Alice"', 'user = "Bob"') +``` -Restriction of table `A` with an empty table will return no entities regardless of -whether there are any matching attributes. -Exclusion of table `A` with an empty table will return all entities in `A`. +### NOT Logic -Restriction by an empty table. +```python +# Exclusion operator +Session - 'user = "Alice"' # Sessions NOT by Alice -![Restriction by an empty table](../images/restrict-example3.png){: style="width:563px; align:center"} +# Not object +Session & dj.Not('user = "Alice"') # Same result +``` -Exclusion by an empty table. +### Complex Combinations -![Exclusion by an empty table](../images/diff-example3.png){: style="width:571px; align:center"} +```python +# (Alice's sessions) OR (sessions after 2024) +(Session & 'user = "Alice"') + (Session & 'session_date > "2024-01-01"') -## Restriction by a mapping +# Alice's sessions that are NOT in 2024 +(Session & 'user = "Alice"') - 'session_date > "2024-01-01"' -A key-value mapping may be used as an operand in restriction. -For each key that is an attribute in `A`, the paired value is treated as part of an -equality condition. -Any key-value pairs without corresponding attributes in `A` are ignored. +# Sessions with trials but no experiments +(Session & Trial) - Experiment +``` -Restriction by an empty mapping or by a mapping with no keys matching the attributes in -`A` will return all the entities in `A`. -Exclusion by an empty mapping or by a mapping with no matches will return no entities. +## Practical Examples -For example, let's say that table `Session` has the attribute `session_date` of -[datatype](../design/tables/attributes.md) `datetime`. -You are interested in sessions from January 1st, 2018, so you write the following -restriction query using a mapping. +### Filter by Primary Key ```python -Session & {'session_date': "2018-01-01"} +# Fetch specific subject +subject = (Subject & {'subject_id': 5}).fetch1() + +# Fetch multiple specific subjects +subjects = (Subject & [{'subject_id': 1}, {'subject_id': 2}]).fetch() ``` -Our mapping contains a typo omitting the final `e` from `session_date`, so no keys in -our mapping will match any attribute in `Session`. -As such, our query will return all of the entities of `Session`. +### Filter by Date Range + +```python +# Sessions in January 2024 +jan_sessions = Session & 'session_date BETWEEN "2024-01-01" AND "2024-01-31"' -## Restriction by a string +# Sessions in the last 30 days +recent = Session & 'session_date >= CURDATE() - INTERVAL 30 DAY' +``` -Restriction can be performed when `cond` is an explicit condition on attribute values, -expressed as a string. -Such conditions may include arithmetic operations, functions, range tests, etc. -Restriction of table `A` by a string containing an attribute not found in table `A` -produces an error. +### Filter by Related Data ```python -# All the sessions performed by Alice -Session & 'user = "Alice"' +# Subjects with at least 5 sessions +active_subjects = Subject & ( + Subject.aggr(Session, n='count(*)') & 'n >= 5' +).proj() -# All the experiments at least one minute long -Experiment & 'duration >= 60' +# Sessions with successful trials +successful_sessions = Session & (Trial & 'success = 1') + +# Experiments with all trials complete +complete_experiments = Experiment - (Trial & 'status != "complete"') +``` + +### Filter by Computed Values + +```python +# Trials longer than average +avg_duration = Trial.proj().aggr(Trial, avg='avg(duration)').fetch1('avg') +long_trials = Trial & f'duration > {avg_duration}' + +# Sessions with above-average trial count +Session & ( + Session.aggr(Trial, n='count(*)') & + f'n > {len(Trial) / len(Session)}' +).proj() ``` -## Restriction by a collection +## Query Patterns -A collection can be a list, a tuple, or a Pandas `DataFrame`. +### Existence Check ```python -# a list: -cond_list = ['first_name = "Aaron"', 'last_name = "Aaronson"'] +# Does subject 1 exist? +if Subject & {'subject_id': 1}: + print("Subject exists") -# a tuple: -cond_tuple = ('first_name = "Aaron"', 'last_name = "Aaronson"') +# Are there any sessions today? +if Session & f'session_date = "{date.today()}"': + print("Sessions recorded today") +``` + +### Find Missing Data + +```python +# Subjects without sessions +orphan_subjects = Subject - Session + +# Sessions without trials +empty_sessions = Session - Trial + +# Experiments missing analysis +unanalyzed = Experiment - Analysis +``` + +### Universal Quantification + +```python +# Subjects where ALL sessions are complete +# (subjects with no incomplete sessions) +complete_subjects = Subject - (Session - 'status = "complete"') -# a dataframe: -import pandas as pd -cond_frame = pd.DataFrame( - data={'first_name': ['Aaron'], 'last_name': ['Aaronson']}) +# Experiments where ALL trials succeeded +successful_experiments = Experiment - (Trial - 'success = 1') ``` -When `cond` is a collection of conditions, the conditions are applied by logical -disjunction (logical OR). -Thus, restriction of table `A` by a collection will return all entities in `A` that -meet *any* of the conditions in the collection. -For example, if you restrict the `Student` table by a collection containing two -conditions, one for a first and one for a last name, your query will return any -students with a matching first name *or* a matching last name. +### Find Related Entities ```python -Student() & ['first_name = "Aaron"', 'last_name = "Aaronson"'] +# All sessions for a specific subject +subject_sessions = Session & (Subject & {'subject_id': 1}) + +# All trials across all sessions for a subject +subject_trials = Trial & (Session & {'subject_id': 1}) ``` -Restriction by a collection, returning all entities matching any condition in the collection. +## Special Restrictions -![Restriction by collection](../images/python_collection.png){: style="align:center"} +### dj.Top -Restriction by an empty collection returns no entities. -Exclusion of table `A` by an empty collection returns all the entities of `A`. +Limit results with optional ordering: -## Restriction by a Boolean expression +```python +# First 10 sessions by date +Session & dj.Top(limit=10, order_by='session_date') -`A & True` and `A - False` are equivalent to `A`. +# Latest 5 sessions +Session & dj.Top(limit=5, order_by='session_date DESC') -`A & False` and `A - True` are empty. +# Pagination: skip first 10, get next 10 +Session & dj.Top(limit=10, offset=10, order_by='session_date') +``` -## Restriction by an `AndList` +### Boolean Values -The special function `dj.AndList` represents logical conjunction (logical AND). -Restriction of table `A` by an `AndList` will return all entities in `A` that meet -*all* of the conditions in the list. -`A & dj.AndList([c1, c2, c3])` is equivalent to `A & c1 & c2 & c3`. -Usually, it is more convenient to simply write out all of the conditions, as -`A & c1 & c2 & c3`. -However, when a list of conditions has already been generated, the list can simply be -passed as the argument to `dj.AndList`. +```python +# True: returns all entities +Session & True # Same as Session -Restriction of table `A` by an empty `AndList`, as in `A & dj.AndList([])`, will return -all of the entities in `A`. -Exclusion by an empty `AndList` will return no entities. +# False: returns no entities +Session & False # Empty result +``` -## Restriction by a `Not` object +### Empty Conditions -The special function `dj.Not` represents logical negation, such that `A & dj.Not(cond)` -is equivalent to `A - cond`. +```python +# Empty dict: returns all entities +Session & {} # Same as Session -## Restriction by a query +# Empty list: returns no entities +Session & [] # Empty result -Restriction by a query object is a generalization of restriction by a table (which is -also a query object), because DataJoint queries always produce well-defined entity -sets, as described in [entity normalization](../design/normalization.md). -As such, restriction by queries follows the same behavior as restriction by tables -described above. +# Empty AndList: returns all entities +Session & dj.AndList([]) # Same as Session +``` + +## Performance Tips -The example below creates a query object corresponding to all the sessions performed by -the user Alice. -The `Experiment` table is then restricted by the query object, returning all the -experiments that are part of sessions performed by Alice. +1. **Primary key restrictions are fastest**: Use when possible +2. **Indexed attributes**: Restrictions on indexed columns are faster +3. **Chain restrictions**: `A & cond1 & cond2` is often faster than complex strings +4. **Avoid fetching then filtering**: Filter in the query, not in Python ```python -query = Session & 'user = "Alice"' -Experiment & query +# Good: filter in query +results = (Session & 'session_date > "2024-01-01"').fetch() + +# Bad: filter after fetch +all_sessions = Session.fetch(as_dict=True) +results = [s for s in all_sessions if s['session_date'] > date(2024, 1, 1)] ``` -## Restriction by `dj.Top` +## Common Mistakes -Restriction by `dj.Top` returns the number of entities specified by the `limit` -argument. These entities can be returned in the order specified by the `order_by` -argument. And finally, the `offset` argument can be used to offset the returned entities -which is useful for pagination in web applications. +### Typos in Dictionary Keys ```python -# Return the first 10 sessions in descending order of session date -Session & dj.Top(limit=10, order_by='session_date DESC') +# Wrong: key doesn't match, returns ALL rows +Session & {'sesion_date': '2024-01-01'} + +# Right: correct spelling +Session & {'session_date': '2024-01-01'} +``` + +### Quoting in String Conditions + +```python +# Wrong: missing quotes around string value +Session & 'user = Alice' + +# Right: quoted string value +Session & 'user = "Alice"' +``` + +### List vs AndList + +```python +# List = OR (any match) +Session & ['user = "Alice"', 'user = "Bob"'] # Alice OR Bob + +# AndList = AND (all must match) +Session & dj.AndList(['session_date > "2024-01-01"', 'user = "Alice"']) ``` diff --git a/docs/src/query/union.md b/docs/src/query/union.md index 71f0fa687..184ab3ec9 100644 --- a/docs/src/query/union.md +++ b/docs/src/query/union.md @@ -7,7 +7,7 @@ Union is rarely needed in practice. ## Union operator `+` The result of the union operator `A + B` contains all the entities from both operands. -[Entity normalization](../design/normalization.md) requires that the operands in a +Entity normalization requires that the operands in a union both belong to the same entity type with the same primary key using homologous attributes. In the absence of any secondary attributes, the result of a union is the simple set union. diff --git a/docs/src/quick-start.md b/docs/src/quick-start.md index 17f783405..b28ca7144 100644 --- a/docs/src/quick-start.md +++ b/docs/src/quick-start.md @@ -319,7 +319,7 @@ Area.populate(display_progress=True) ``` The `make` method populates automated tables from inserted data. Read more in the -full article [here](./compute/make.md) +full article [here](./operations/make.md) ## Query diff --git a/docs/src/citation.md b/docs/src/reference/citation.md similarity index 100% rename from docs/src/citation.md rename to docs/src/reference/citation.md diff --git a/docs/src/develop.md b/docs/src/reference/develop.md similarity index 100% rename from docs/src/develop.md rename to docs/src/reference/develop.md diff --git a/docs/src/faq.md b/docs/src/reference/faq.md similarity index 100% rename from docs/src/faq.md rename to docs/src/reference/faq.md diff --git a/docs/src/publish-data.md b/docs/src/reference/publish-data.md similarity index 100% rename from docs/src/publish-data.md rename to docs/src/reference/publish-data.md diff --git a/docs/src/internal/transpilation.md b/docs/src/reference/transpilation.md similarity index 100% rename from docs/src/internal/transpilation.md rename to docs/src/reference/transpilation.md diff --git a/pyproject.toml b/pyproject.toml index dc151d7cf..8d27481eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,12 +17,14 @@ dependencies = [ "networkx", "pydot", "minio>=7.0.0", + "fsspec>=2023.1.0", "matplotlib", "faker", "urllib3", "setuptools", "pydantic-settings>=2.0.0", ] + requires-python = ">=3.10,<3.14" authors = [ {name = "Dimitri Yatsenko", email = "dimitri@datajoint.com"}, @@ -90,6 +92,9 @@ test = [ ] [project.optional-dependencies] +s3 = ["s3fs>=2023.1.0"] +gcs = ["gcsfs>=2023.1.0"] +azure = ["adlfs>=2023.1.0"] dev = [ "pre-commit", "ruff", diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py index 0f8123c66..405134630 100644 --- a/src/datajoint/__init__.py +++ b/src/datajoint/__init__.py @@ -45,18 +45,25 @@ "kill", "MatCell", "MatStruct", - "AttributeAdapter", + "AttributeType", + "register_type", + "list_types", + "AttributeAdapter", # Deprecated, use AttributeType "errors", + "migrate", "DataJointError", "key", "key_hash", "logger", "cli", + "ObjectRef", ] from . import errors +from . import migrate from .admin import kill from .attribute_adapter import AttributeAdapter +from .attribute_type import AttributeType, list_types, register_type from .blob import MatCell, MatStruct from .cli import cli from .connection import Connection, conn @@ -66,6 +73,7 @@ from .fetch import key from .hash import key_hash from .logging import logger +from .objectref import ObjectRef from .schemas import Schema, VirtualModule, list_schemas from .settings import config from .table import FreeTable, Table diff --git a/src/datajoint/attribute_adapter.py b/src/datajoint/attribute_adapter.py index 12a34f27e..7df566a58 100644 --- a/src/datajoint/attribute_adapter.py +++ b/src/datajoint/attribute_adapter.py @@ -1,61 +1,211 @@ +""" +Legacy attribute adapter module. + +This module provides backward compatibility for the deprecated AttributeAdapter class. +New code should use :class:`datajoint.AttributeType` instead. + +.. deprecated:: 0.15 + Use :class:`datajoint.AttributeType` with ``encode``/``decode`` methods. +""" + import re +import warnings +from typing import Any + +from .attribute_type import AttributeType, get_type, is_type_registered +from .errors import DataJointError -from .errors import DataJointError, _support_adapted_types +# Pattern to detect blob types for internal pack/unpack +_BLOB_PATTERN = re.compile(r"^(tiny|small|medium|long|)blob", re.I) -class AttributeAdapter: +class AttributeAdapter(AttributeType): """ - Base class for adapter objects for user-defined attribute types. + Legacy base class for attribute adapters. + + .. deprecated:: 0.15 + Use :class:`datajoint.AttributeType` with ``encode``/``decode`` methods instead. + + This class provides backward compatibility for existing adapters that use + the ``attribute_type``, ``put()``, and ``get()`` API. + + Migration guide:: + + # Old style (deprecated): + class GraphAdapter(dj.AttributeAdapter): + attribute_type = "longblob" + + def put(self, graph): + return list(graph.edges) + + def get(self, edges): + return nx.Graph(edges) + + # New style (recommended): + @dj.register_type + class GraphType(dj.AttributeType): + type_name = "graph" + dtype = "longblob" + + def encode(self, graph, *, key=None): + return list(graph.edges) + + def decode(self, edges, *, key=None): + return nx.Graph(edges) """ + # Subclasses can set this as a class attribute instead of property + attribute_type: str = None # type: ignore + + def __init__(self): + # Emit deprecation warning on instantiation + warnings.warn( + f"{self.__class__.__name__} uses the deprecated AttributeAdapter API. " + "Migrate to AttributeType with encode/decode methods.", + DeprecationWarning, + stacklevel=2, + ) + @property - def attribute_type(self): + def type_name(self) -> str: """ - :return: a supported DataJoint attribute type to use; e.g. "longblob", "blob@store" + Infer type name from class name for legacy adapters. + + Legacy adapters were identified by their variable name in the context dict, + not by a property. For backward compatibility, we use the lowercase class name. + """ + # Check if a _type_name was explicitly set (for context-based lookup) + if hasattr(self, "_type_name"): + return self._type_name + # Fall back to class name + return self.__class__.__name__.lower() + + @property + def dtype(self) -> str: + """Map legacy attribute_type to new dtype property.""" + attr_type = self.attribute_type + if attr_type is None: + raise NotImplementedError( + f"{self.__class__.__name__} must define 'attribute_type' " "(or migrate to AttributeType with 'dtype')" + ) + return attr_type + + def _is_blob_dtype(self) -> bool: + """Check if dtype is a blob type requiring pack/unpack.""" + return bool(_BLOB_PATTERN.match(self.dtype)) + + def encode(self, value: Any, *, key: dict | None = None) -> Any: """ - raise NotImplementedError("Undefined attribute adapter") + Delegate to legacy put() method, with blob packing if needed. - def get(self, value): + Legacy adapters expect blob.pack to be called after put() when + the dtype is a blob type. This wrapper handles that automatically. """ - convert value retrieved from the the attribute in a table into the adapted type + result = self.put(value) + # Legacy adapters expect blob.pack after put() for blob dtypes + if self._is_blob_dtype(): + from . import blob - :param value: value from the database + result = blob.pack(result) + return result - :return: object of the adapted type + def decode(self, stored: Any, *, key: dict | None = None) -> Any: """ - raise NotImplementedError("Undefined attribute adapter") + Delegate to legacy get() method, with blob unpacking if needed. - def put(self, obj): + Legacy adapters expect blob.unpack to be called before get() when + the dtype is a blob type. This wrapper handles that automatically. """ - convert an object of the adapted type into a value that DataJoint can store in a table attribute + # Legacy adapters expect blob.unpack before get() for blob dtypes + if self._is_blob_dtype(): + from . import blob + + stored = blob.unpack(stored) + return self.get(stored) - :param obj: an object of the adapted type - :return: value to store in the database + def put(self, obj: Any) -> Any: """ - raise NotImplementedError("Undefined attribute adapter") + Convert an object of the adapted type into a storable value. + + .. deprecated:: 0.15 + Override ``encode()`` instead. + Args: + obj: An object of the adapted type. -def get_adapter(context, adapter_name): + Returns: + Value to store in the database. + """ + raise NotImplementedError(f"{self.__class__.__name__} must implement put() or migrate to encode()") + + def get(self, value: Any) -> Any: + """ + Convert a value from the database into the adapted type. + + .. deprecated:: 0.15 + Override ``decode()`` instead. + + Args: + value: Value from the database. + + Returns: + Object of the adapted type. + """ + raise NotImplementedError(f"{self.__class__.__name__} must implement get() or migrate to decode()") + + +def get_adapter(context: dict | None, adapter_name: str) -> AttributeType: """ - Extract the AttributeAdapter object by its name from the context and validate. + Get an attribute type/adapter by name. + + This function provides backward compatibility by checking both: + 1. The global type registry (new system) + 2. The schema context dict (legacy system) + + Args: + context: Schema context dictionary (for legacy adapters). + adapter_name: The adapter/type name, with or without angle brackets. + + Returns: + The AttributeType instance. + + Raises: + DataJointError: If the adapter is not found or invalid. """ - if not _support_adapted_types(): - raise DataJointError("Support for Adapted Attribute types is disabled.") adapter_name = adapter_name.lstrip("<").rstrip(">") + + # First, check the global type registry (new system) + if is_type_registered(adapter_name): + return get_type(adapter_name) + + # Fall back to context-based lookup (legacy system) + if context is None: + raise DataJointError( + f"Attribute type <{adapter_name}> is not registered. " "Use @dj.register_type to register custom types." + ) + try: adapter = context[adapter_name] except KeyError: - raise DataJointError("Attribute adapter '{adapter_name}' is not defined.".format(adapter_name=adapter_name)) - if not isinstance(adapter, AttributeAdapter): raise DataJointError( - "Attribute adapter '{adapter_name}' must be an instance of datajoint.AttributeAdapter".format( - adapter_name=adapter_name - ) + f"Attribute type <{adapter_name}> is not defined. " + "Register it with @dj.register_type or include it in the schema context." ) - if not isinstance(adapter.attribute_type, str) or not re.match(r"^\w", adapter.attribute_type): + + # Validate it's an AttributeType (or legacy AttributeAdapter) + if not isinstance(adapter, AttributeType): raise DataJointError( - "Invalid attribute type {type} in attribute adapter '{adapter_name}'".format( - type=adapter.attribute_type, adapter_name=adapter_name - ) + f"Attribute adapter '{adapter_name}' must be an instance of " + "datajoint.AttributeType (or legacy datajoint.AttributeAdapter)" ) + + # For legacy adapters from context, store the name they were looked up by + if isinstance(adapter, AttributeAdapter): + adapter._type_name = adapter_name + + # Validate the dtype/attribute_type + dtype = adapter.dtype + if not isinstance(dtype, str) or not re.match(r"^\w", dtype): + raise DataJointError(f"Invalid dtype '{dtype}' in attribute type <{adapter_name}>") + return adapter diff --git a/src/datajoint/attribute_type.py b/src/datajoint/attribute_type.py new file mode 100644 index 000000000..9be2d2214 --- /dev/null +++ b/src/datajoint/attribute_type.py @@ -0,0 +1,531 @@ +""" +Custom attribute type system for DataJoint. + +This module provides the AttributeType base class and registration mechanism +for creating custom data types that extend DataJoint's native type system. + +Custom types enable seamless integration of complex Python objects (like NumPy arrays, +graphs, or domain-specific structures) with DataJoint's relational storage. + +Example: + @dj.register_type + class GraphType(dj.AttributeType): + type_name = "graph" + dtype = "longblob" + + def encode(self, graph: nx.Graph) -> list: + return list(graph.edges) + + def decode(self, edges: list) -> nx.Graph: + return nx.Graph(edges) + + # Then use in table definitions: + class MyTable(dj.Manual): + definition = ''' + id : int + --- + data : + ''' +""" + +from __future__ import annotations + +import logging +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any + +from .errors import DataJointError + +if TYPE_CHECKING: + pass + +logger = logging.getLogger(__name__.split(".")[0]) + +# Global type registry - maps type_name to AttributeType instance +_type_registry: dict[str, AttributeType] = {} +_entry_points_loaded: bool = False + + +class AttributeType(ABC): + """ + Base class for custom DataJoint attribute types. + + Subclass this to create custom types that can be used in table definitions + with the ```` syntax. Custom types define bidirectional conversion + between Python objects and DataJoint's storage format. + + Attributes: + type_name: Unique identifier used in ```` syntax + dtype: Underlying DataJoint storage type + + Example: + @dj.register_type + class GraphType(dj.AttributeType): + type_name = "graph" + dtype = "longblob" + + def encode(self, graph): + return list(graph.edges) + + def decode(self, edges): + import networkx as nx + return nx.Graph(edges) + + The type can then be used in table definitions:: + + class Connectivity(dj.Manual): + definition = ''' + id : int + --- + graph_data : + ''' + """ + + @property + @abstractmethod + def type_name(self) -> str: + """ + Unique identifier for this type, used in table definitions as ````. + + This name must be unique across all registered types. It should be lowercase + with underscores (e.g., "graph", "zarr_array", "compressed_image"). + + Returns: + The type name string without angle brackets. + """ + ... + + @property + @abstractmethod + def dtype(self) -> str: + """ + The underlying DataJoint type used for storage. + + Can be: + - A native type: ``"longblob"``, ``"blob"``, ``"varchar(255)"``, ``"int"``, ``"json"`` + - An external type: ``"blob@store"``, ``"attach@store"`` + - The object type: ``"object"`` + - Another custom type: ``""`` (enables type chaining) + + Returns: + The storage type specification string. + """ + ... + + @abstractmethod + def encode(self, value: Any, *, key: dict | None = None) -> Any: + """ + Convert a Python object to the storable format. + + Called during INSERT operations to transform user-provided objects + into a format suitable for storage in the underlying ``dtype``. + + Args: + value: The Python object to store. + key: Primary key values as a dict. Available when the dtype uses + object storage and may be needed for path construction. + + Returns: + Value in the format expected by ``dtype``. For example: + - For ``dtype="longblob"``: any picklable Python object + - For ``dtype="object"``: path string or file-like object + - For ``dtype="varchar(N)"``: string + """ + ... + + @abstractmethod + def decode(self, stored: Any, *, key: dict | None = None) -> Any: + """ + Convert stored data back to a Python object. + + Called during FETCH operations to reconstruct the original Python + object from the stored format. + + Args: + stored: Data retrieved from storage. Type depends on ``dtype``: + - For ``"object"``: an ``ObjectRef`` handle + - For blob types: the unpacked Python object + - For native types: the native Python value (str, int, etc.) + key: Primary key values as a dict. + + Returns: + The reconstructed Python object. + """ + ... + + def validate(self, value: Any) -> None: + """ + Validate a value before encoding. + + Override this method to add type checking or domain constraints. + Called automatically before ``encode()`` during INSERT operations. + The default implementation accepts any value. + + Args: + value: The value to validate. + + Raises: + TypeError: If the value has an incompatible type. + ValueError: If the value fails domain validation. + """ + pass + + def default(self) -> Any: + """ + Return a default value for this type. + + Override if the type has a sensible default value. The default + implementation raises NotImplementedError, indicating no default exists. + + Returns: + The default value for this type. + + Raises: + NotImplementedError: If no default exists (the default behavior). + """ + raise NotImplementedError(f"No default value for type <{self.type_name}>") + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}(type_name={self.type_name!r}, dtype={self.dtype!r})>" + + +def register_type(cls: type[AttributeType]) -> type[AttributeType]: + """ + Register a custom attribute type with DataJoint. + + Can be used as a decorator or called directly. The type becomes available + for use in table definitions with the ```` syntax. + + Args: + cls: An AttributeType subclass to register. + + Returns: + The same class, unmodified (allows use as decorator). + + Raises: + DataJointError: If a type with the same name is already registered + by a different class. + TypeError: If cls is not an AttributeType subclass. + + Example: + As a decorator:: + + @dj.register_type + class GraphType(dj.AttributeType): + type_name = "graph" + ... + + Or called directly:: + + dj.register_type(GraphType) + """ + if not isinstance(cls, type) or not issubclass(cls, AttributeType): + raise TypeError(f"register_type requires an AttributeType subclass, got {cls!r}") + + instance = cls() + name = instance.type_name + + if not isinstance(name, str) or not name: + raise DataJointError(f"type_name must be a non-empty string, got {name!r}") + + if name in _type_registry: + existing = _type_registry[name] + if type(existing) is not cls: + raise DataJointError( + f"Type <{name}> is already registered by " f"{type(existing).__module__}.{type(existing).__name__}" + ) + # Same class registered twice - idempotent, no error + return cls + + _type_registry[name] = instance + logger.debug(f"Registered attribute type <{name}> from {cls.__module__}.{cls.__name__}") + return cls + + +def unregister_type(name: str) -> None: + """ + Remove a type from the registry. + + Primarily useful for testing. Use with caution in production code. + + Args: + name: The type_name to unregister. + + Raises: + DataJointError: If the type is not registered. + """ + name = name.strip("<>") + if name not in _type_registry: + raise DataJointError(f"Type <{name}> is not registered") + del _type_registry[name] + + +def get_type(name: str) -> AttributeType: + """ + Retrieve a registered attribute type by name. + + Looks up the type in the explicit registry first, then attempts + to load from installed packages via entry points. + + Args: + name: The type name, with or without angle brackets. + + Returns: + The registered AttributeType instance. + + Raises: + DataJointError: If the type is not found. + """ + name = name.strip("<>") + + # Check explicit registry first + if name in _type_registry: + return _type_registry[name] + + # Lazy-load entry points + _load_entry_points() + + if name in _type_registry: + return _type_registry[name] + + raise DataJointError( + f"Unknown attribute type: <{name}>. " f"Ensure the type is registered via @dj.register_type or installed as a package." + ) + + +def list_types() -> list[str]: + """ + List all registered type names. + + Returns: + Sorted list of registered type names. + """ + _load_entry_points() + return sorted(_type_registry.keys()) + + +def is_type_registered(name: str) -> bool: + """ + Check if a type name is registered. + + Args: + name: The type name to check. + + Returns: + True if the type is registered. + """ + name = name.strip("<>") + if name in _type_registry: + return True + _load_entry_points() + return name in _type_registry + + +def _load_entry_points() -> None: + """ + Load attribute types from installed packages via entry points. + + Types are discovered from the ``datajoint.types`` entry point group. + Packages declare types in pyproject.toml:: + + [project.entry-points."datajoint.types"] + zarr_array = "dj_zarr:ZarrArrayType" + + This function is idempotent - entry points are only loaded once. + """ + global _entry_points_loaded + if _entry_points_loaded: + return + + _entry_points_loaded = True + + try: + from importlib.metadata import entry_points + except ImportError: + # Python < 3.10 fallback + try: + from importlib_metadata import entry_points + except ImportError: + logger.debug("importlib.metadata not available, skipping entry point discovery") + return + + try: + # Python 3.10+ / importlib_metadata 3.6+ + eps = entry_points(group="datajoint.types") + except TypeError: + # Older API + eps = entry_points().get("datajoint.types", []) + + for ep in eps: + if ep.name in _type_registry: + # Already registered explicitly, skip entry point + continue + try: + type_class = ep.load() + register_type(type_class) + logger.debug(f"Loaded attribute type <{ep.name}> from entry point {ep.value}") + except Exception as e: + logger.warning(f"Failed to load attribute type '{ep.name}' from {ep.value}: {e}") + + +def resolve_dtype(dtype: str, seen: set[str] | None = None) -> tuple[str, list[AttributeType]]: + """ + Resolve a dtype string, following type chains. + + If dtype references another custom type (e.g., ""), recursively + resolves to find the ultimate storage type. + + Args: + dtype: The dtype string to resolve. + seen: Set of already-seen type names (for cycle detection). + + Returns: + Tuple of (final_storage_type, list_of_types_in_chain). + The chain is ordered from outermost to innermost type. + + Raises: + DataJointError: If a circular type reference is detected. + """ + if seen is None: + seen = set() + + chain: list[AttributeType] = [] + + # Check if dtype is a custom type reference + if dtype.startswith("<") and dtype.endswith(">"): + type_name = dtype[1:-1] + + if type_name in seen: + raise DataJointError(f"Circular type reference detected: <{type_name}>") + + seen.add(type_name) + attr_type = get_type(type_name) + chain.append(attr_type) + + # Recursively resolve the inner dtype + inner_dtype, inner_chain = resolve_dtype(attr_type.dtype, seen) + chain.extend(inner_chain) + return inner_dtype, chain + + # Not a custom type - return as-is + return dtype, chain + + +# ============================================================================= +# Built-in Attribute Types +# ============================================================================= + + +class DJBlobType(AttributeType): + """ + Built-in type for DataJoint's native serialization format. + + This type handles serialization of arbitrary Python objects (including NumPy arrays, + dictionaries, lists, etc.) using DataJoint's binary blob format. The format includes: + + - Protocol headers (``mYm`` for MATLAB-compatible, ``dj0`` for Python-native) + - Optional compression (zlib) + - Support for NumPy arrays, datetime objects, UUIDs, and nested structures + + The ```` type is the explicit way to specify DataJoint's serialization. + It stores data in a MySQL ``LONGBLOB`` column. + + Example: + @schema + class ProcessedData(dj.Manual): + definition = ''' + data_id : int + --- + results : # Serialized Python objects + raw_bytes : longblob # Raw bytes (no serialization) + ''' + + Note: + Plain ``longblob`` columns store and return raw bytes without serialization. + Use ```` when you need automatic serialization of Python objects. + Existing schemas using implicit blob serialization should migrate to ```` + using ``dj.migrate.migrate_blob_columns()``. + """ + + type_name = "djblob" + dtype = "longblob" + + def encode(self, value: Any, *, key: dict | None = None) -> bytes: + """ + Serialize a Python object to DataJoint's blob format. + + Args: + value: Any serializable Python object (dict, list, numpy array, etc.) + key: Primary key values (unused for blob serialization). + + Returns: + Serialized bytes with protocol header and optional compression. + """ + from . import blob + + return blob.pack(value, compress=True) + + def decode(self, stored: bytes, *, key: dict | None = None) -> Any: + """ + Deserialize DataJoint blob format back to a Python object. + + Args: + stored: Serialized blob bytes. + key: Primary key values (unused for blob serialization). + + Returns: + The deserialized Python object. + """ + from . import blob + + return blob.unpack(stored, squeeze=False) + + +class DJBlobExternalType(AttributeType): + """ + Built-in type for externally-stored DataJoint blobs. + + Similar to ```` but stores data in external blob storage instead + of inline in the database. Useful for large objects. + + The store name is specified when defining the column type. + + Example: + @schema + class LargeData(dj.Manual): + definition = ''' + data_id : int + --- + large_array : blob@mystore # External storage with auto-serialization + ''' + """ + + # Note: This type isn't directly usable via syntax + # It's used internally when blob@store syntax is detected + type_name = "djblob_external" + dtype = "blob@store" # Placeholder - actual store is determined at declaration time + + def encode(self, value: Any, *, key: dict | None = None) -> bytes: + """Serialize a Python object to DataJoint's blob format.""" + from . import blob + + return blob.pack(value, compress=True) + + def decode(self, stored: bytes, *, key: dict | None = None) -> Any: + """Deserialize DataJoint blob format back to a Python object.""" + from . import blob + + return blob.unpack(stored, squeeze=False) + + +def _register_builtin_types() -> None: + """ + Register DataJoint's built-in attribute types. + + Called automatically during module initialization. + """ + register_type(DJBlobType) + + +# Register built-in types when module is loaded +_register_builtin_types() diff --git a/src/datajoint/autopopulate.py b/src/datajoint/autopopulate.py index 677a8113c..c90116a74 100644 --- a/src/datajoint/autopopulate.py +++ b/src/datajoint/autopopulate.py @@ -5,7 +5,6 @@ import inspect import logging import multiprocessing as mp -import random import signal import traceback @@ -13,8 +12,7 @@ from tqdm import tqdm from .errors import DataJointError, LostConnectionError -from .expression import AndList, QueryExpression -from .hash import key_hash +from .expression import AndList # noinspection PyExceptionInherit,PyCallingNonCallable @@ -55,6 +53,7 @@ class AutoPopulate: _key_source = None _allow_insert = False + _jobs_table = None # Cached JobsTable instance @property def key_source(self): @@ -74,7 +73,7 @@ def _rename_attributes(table, props): ) if self._key_source is None: - parents = self.target.parents(primary=True, as_objects=True, foreign_key_info=True) + parents = self.parents(primary=True, as_objects=True, foreign_key_info=True) if not parents: raise DataJointError("A table must have dependencies from its primary key for auto-populate to work") self._key_source = _rename_attributes(*parents[0]) @@ -152,49 +151,20 @@ def make(self, key): yield @property - def target(self): + def jobs(self): """ - :return: table to be populated. - In the typical case, dj.AutoPopulate is mixed into a dj.Table class by - inheritance and the target is self. - """ - return self + Access the jobs table for this auto-populated table. - def _job_key(self, key): - """ - :param key: they key returned for the job from the key source - :return: the dict to use to generate the job reservation hash - This method allows subclasses to control the job reservation granularity. - """ - return key + The jobs table provides per-table job queue management with rich status + tracking (pending, reserved, success, error, ignore). - def _jobs_to_do(self, restrictions): - """ - :return: the query yielding the keys to be computed (derived from self.key_source) + :return: JobsTable instance for this table """ - if self.restriction: - raise DataJointError( - "Cannot call populate on a restricted table. Instead, pass conditions to populate() as arguments." - ) - todo = self.key_source + if self._jobs_table is None: + from .jobs import JobsTable - # key_source is a QueryExpression subclass -- trigger instantiation - if inspect.isclass(todo) and issubclass(todo, QueryExpression): - todo = todo() - - if not isinstance(todo, QueryExpression): - raise DataJointError("Invalid key_source value") - - try: - # check if target lacks any attributes from the primary key of key_source - raise DataJointError( - "The populate target lacks attribute %s " - "from the primary key of key_source" - % next(name for name in todo.heading.primary_key if name not in self.target.heading) - ) - except StopIteration: - pass - return (todo & AndList(restrictions)).proj() + self._jobs_table = JobsTable(self) + return self._jobs_table def populate( self, @@ -203,12 +173,12 @@ def populate( suppress_errors=False, return_exception_objects=False, reserve_jobs=False, - order="original", - limit=None, max_calls=None, display_progress=False, processes=1, make_kwargs=None, + priority=None, + refresh=True, ): """ ``table.populate()`` calls ``table.make(key)`` for every primary key in @@ -221,8 +191,6 @@ def populate( :param suppress_errors: if True, do not terminate execution. :param return_exception_objects: return error objects instead of just error messages :param reserve_jobs: if True, reserve jobs to populate in asynchronous fashion - :param order: "original"|"reverse"|"random" - the order of execution - :param limit: if not None, check at most this many keys :param max_calls: if not None, populate at most this many keys :param display_progress: if True, report progress_bar :param processes: number of processes to use. Set to None to use all cores @@ -230,6 +198,10 @@ def populate( to be passed down to each ``make()`` call. Computation arguments should be specified within the pipeline e.g. using a `dj.Lookup` table. :type make_kwargs: dict, optional + :param priority: Only process jobs at this priority or more urgent (lower values). + Only applies when reserve_jobs=True. + :param refresh: If True and no pending jobs are found, refresh the jobs queue + before giving up. Only applies when reserve_jobs=True. :return: a dict with two keys "success_count": the count of successful ``make()`` calls in this ``populate()`` call "error_list": the error list that is filled if `suppress_errors` is True @@ -237,10 +209,10 @@ def populate( if self.connection.in_transaction: raise DataJointError("Populate cannot be called during a transaction.") - valid_order = ["original", "reverse", "random"] - if order not in valid_order: - raise DataJointError("The order argument must be one of %s" % str(valid_order)) - jobs = self.connection.schemas[self.target.database].jobs if reserve_jobs else None + if self.restriction: + raise DataJointError( + "Cannot call populate on a restricted table. " "Instead, pass conditions to populate() as arguments." + ) if reserve_jobs: # Define a signal handler for SIGTERM @@ -250,29 +222,25 @@ def handler(signum, frame): old_handler = signal.signal(signal.SIGTERM, handler) - if keys is None: - keys = (self._jobs_to_do(restrictions) - self.target).fetch("KEY", limit=limit) + error_list = [] + success_list = [] - # exclude "error", "ignore" or "reserved" jobs if reserve_jobs: - exclude_key_hashes = ( - jobs & {"table_name": self.target.table_name} & 'status in ("error", "ignore", "reserved")' - ).fetch("key_hash") - keys = [key for key in keys if key_hash(key) not in exclude_key_hashes] - - if order == "reverse": - keys.reverse() - elif order == "random": - random.shuffle(keys) + # Use jobs table for coordinated processing + keys = self.jobs.fetch_pending(limit=max_calls, priority=priority) + if not keys and refresh: + logger.debug("No pending jobs found, refreshing jobs queue") + self.jobs.refresh(*restrictions) + keys = self.jobs.fetch_pending(limit=max_calls, priority=priority) + else: + # Without job reservations: compute keys directly from key_source + if keys is None: + todo = (self.key_source & AndList(restrictions)).proj() + keys = (todo - self).fetch("KEY", limit=max_calls) logger.debug("Found %d keys to populate" % len(keys)) - - keys = keys[:max_calls] nkeys = len(keys) - error_list = [] - success_list = [] - if nkeys: processes = min(_ for _ in (processes, nkeys, mp.cpu_count()) if _) @@ -282,6 +250,8 @@ def handler(signum, frame): make_kwargs=make_kwargs, ) + jobs = self.jobs if reserve_jobs else None + if processes == 1: for key in tqdm(keys, desc=self.__class__.__name__) if display_progress else keys: status = self._populate1(key, jobs, **populate_kwargs) @@ -322,46 +292,49 @@ def handler(signum, frame): def _populate1(self, key, jobs, suppress_errors, return_exception_objects, make_kwargs=None): """ populates table for one source key, calling self.make inside a transaction. - :param jobs: the jobs table or None if not reserve_jobs + :param jobs: the jobs table (JobsTable) or None if not reserve_jobs :param key: dict specifying job to populate :param suppress_errors: bool if errors should be suppressed and returned :param return_exception_objects: if True, errors must be returned as objects :return: (key, error) when suppress_errors=True, True if successfully invoke one `make()` call, otherwise False """ - # use the legacy `_make_tuples` callback. - make = self._make_tuples if hasattr(self, "_make_tuples") else self.make + import time - if jobs is not None and not jobs.reserve(self.target.table_name, self._job_key(key)): - return False + start_time = time.time() - # if make is a generator, it transaction can be delayed until the final stage - is_generator = inspect.isgeneratorfunction(make) + # Reserve the job (per-key, before make) + if jobs is not None: + jobs.reserve(key) + + # if make is a generator, transaction can be delayed until the final stage + is_generator = inspect.isgeneratorfunction(self.make) if not is_generator: self.connection.start_transaction() - if key in self.target: # already populated + if key in self: # already populated if not is_generator: self.connection.cancel_transaction() if jobs is not None: - jobs.complete(self.target.table_name, self._job_key(key)) + # Job already done - mark complete or delete + jobs.complete(key, duration=0) return False - logger.debug(f"Making {key} -> {self.target.full_table_name}") + logger.debug(f"Making {key} -> {self.full_table_name}") self.__class__._allow_insert = True try: if not is_generator: - make(dict(key), **(make_kwargs or {})) + self.make(dict(key), **(make_kwargs or {})) else: # tripartite make - transaction is delayed until the final stage - gen = make(dict(key), **(make_kwargs or {})) + gen = self.make(dict(key), **(make_kwargs or {})) fetched_data = next(gen) fetch_hash = deepdiff.DeepHash(fetched_data, ignore_iterable_order=False)[fetched_data] computed_result = next(gen) # perform the computation # fetch and insert inside a transaction self.connection.start_transaction() - gen = make(dict(key), **(make_kwargs or {})) # restart make + gen = self.make(dict(key), **(make_kwargs or {})) # restart make fetched_data = next(gen) if ( fetch_hash != deepdiff.DeepHash(fetched_data, ignore_iterable_order=False)[fetched_data] @@ -378,15 +351,25 @@ def _populate1(self, key, jobs, suppress_errors, return_exception_objects, make_ exception=error.__class__.__name__, msg=": " + str(error) if str(error) else "", ) - logger.debug(f"Error making {key} -> {self.target.full_table_name} - {error_message}") + logger.debug(f"Error making {key} -> {self.full_table_name} - {error_message}") + + # Only log errors from inside make() - not collision errors if jobs is not None: - # show error name and error message (if any) - jobs.error( - self.target.table_name, - self._job_key(key), - error_message=error_message, - error_stack=traceback.format_exc(), - ) + from .errors import DuplicateError + + if isinstance(error, DuplicateError): + # Collision error - job reverts to pending or gets deleted + # This is not a real error, just coordination artifact + logger.debug(f"Duplicate key collision for {key}, reverting job") + # Delete the reservation, letting the job be picked up again or cleaned + (jobs & key).delete_quick() + else: + # Real error inside make() - log it + jobs.error( + key, + error_message=error_message, + error_stack=traceback.format_exc(), + ) if not suppress_errors or isinstance(error, SystemExit): raise else: @@ -394,9 +377,10 @@ def _populate1(self, key, jobs, suppress_errors, return_exception_objects, make_ return key, error if return_exception_objects else error_message else: self.connection.commit_transaction() - logger.debug(f"Success making {key} -> {self.target.full_table_name}") + duration = time.time() - start_time + logger.debug(f"Success making {key} -> {self.full_table_name}") if jobs is not None: - jobs.complete(self.target.table_name, self._job_key(key)) + jobs.complete(key, duration=duration) return True finally: self.__class__._allow_insert = False @@ -406,9 +390,9 @@ def progress(self, *restrictions, display=False): Report the progress of populating the table. :return: (remaining, total) -- numbers of tuples to be populated """ - todo = self._jobs_to_do(restrictions) + todo = (self.key_source & AndList(restrictions)).proj() total = len(todo) - remaining = len(todo - self.target) + remaining = len(todo - self) if display: logger.info( "%-20s" % self.__class__.__name__ diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index f23f74a26..4aee4aa7a 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -65,6 +65,7 @@ INTERNAL_ATTACH=r"attach$", EXTERNAL_ATTACH=r"attach@(?P[a-z][\-\w]*)$", FILEPATH=r"filepath@(?P[a-z][\-\w]*)$", + OBJECT=r"object$", # managed object storage (files/folders) UUID=r"uuid$", ADAPTED=r"<.+>$", ).items() @@ -77,6 +78,7 @@ "EXTERNAL_ATTACH", "EXTERNAL_BLOB", "FILEPATH", + "OBJECT", "ADAPTED", } | set(SQL_TYPE_ALIASES) NATIVE_TYPES = set(TYPE_PATTERN) - SPECIAL_TYPES @@ -465,6 +467,9 @@ def substitute_special_type(match, category, foreign_key_sql, context): match["type"] = UUID_DATA_TYPE elif category == "INTERNAL_ATTACH": match["type"] = "LONGBLOB" + elif category == "OBJECT": + # Object type stores metadata as JSON - no foreign key to external table + match["type"] = "JSON" elif category in EXTERNAL_TYPES: if category == "FILEPATH" and not _support_filepath_types(): raise DataJointError( @@ -481,8 +486,8 @@ def substitute_special_type(match, category, foreign_key_sql, context): "ON UPDATE RESTRICT ON DELETE RESTRICT".format(external_table_root=EXTERNAL_TABLE_ROOT, **match) ) elif category == "ADAPTED": - adapter = get_adapter(context, match["type"]) - match["type"] = adapter.attribute_type + attr_type = get_adapter(context, match["type"]) + match["type"] = attr_type.dtype category = match_type(match["type"]) if category in SPECIAL_TYPES: # recursive redefinition from user-defined datatypes. diff --git a/src/datajoint/external.py b/src/datajoint/external.py index 3f9efcf8e..06e76af37 100644 --- a/src/datajoint/external.py +++ b/src/datajoint/external.py @@ -1,17 +1,18 @@ import logging +import warnings from collections.abc import Mapping from pathlib import Path, PurePosixPath, PureWindowsPath from tqdm import tqdm -from . import errors, s3 from .declare import EXTERNAL_TABLE_ROOT from .errors import DataJointError, MissingExternalFile from .hash import uuid_from_buffer, uuid_from_file from .heading import Heading from .settings import config +from .storage import StorageBackend from .table import FreeTable, Table -from .utils import safe_copy, safe_write +from .utils import safe_write logger = logging.getLogger(__name__.split(".")[0]) @@ -37,8 +38,6 @@ class ExternalTable(Table): def __init__(self, connection, store, database): self.store = store - self.spec = config.get_store_spec(store) - self._s3 = None self.database = database self._connection = connection self._heading = Heading( @@ -52,9 +51,8 @@ def __init__(self, connection, store, database): self._support = [self.full_table_name] if not self.is_declared: self.declare() - self._s3 = None - if self.spec["protocol"] == "file" and not Path(self.spec["location"]).is_dir(): - raise FileNotFoundError("Inaccessible local directory %s" % self.spec["location"]) from None + # Initialize storage backend (validates configuration) + self.storage = StorageBackend(config.get_store_spec(store)) @property def definition(self): @@ -75,91 +73,78 @@ def table_name(self): @property def s3(self): - if self._s3 is None: - self._s3 = s3.Folder(**self.spec) - return self._s3 + """Deprecated: Use storage property instead.""" + warnings.warn( + "ExternalTable.s3 is deprecated. Use ExternalTable.storage instead.", + DeprecationWarning, + stacklevel=2, + ) + # For backward compatibility, return a legacy s3.Folder if needed + from . import s3 + + if not hasattr(self, "_s3_legacy") or self._s3_legacy is None: + self._s3_legacy = s3.Folder(**self.storage.spec) + return self._s3_legacy # - low-level operations - private def _make_external_filepath(self, relative_filepath): """resolve the complete external path based on the relative path""" - # Strip root - if self.spec["protocol"] == "s3": - posix_path = PurePosixPath(PureWindowsPath(self.spec["location"])) + spec = self.storage.spec + # Strip root for S3 paths + if spec["protocol"] == "s3": + posix_path = PurePosixPath(PureWindowsPath(spec["location"])) location_path = ( Path(*posix_path.parts[1:]) - if len(self.spec["location"]) > 0 and any(case in posix_path.parts[0] for case in ("\\", ":")) + if len(spec["location"]) > 0 and any(case in posix_path.parts[0] for case in ("\\", ":")) else Path(posix_path) ) return PurePosixPath(location_path, relative_filepath) - # Preserve root - elif self.spec["protocol"] == "file": - return PurePosixPath(Path(self.spec["location"]), relative_filepath) + # Preserve root for local filesystem + elif spec["protocol"] == "file": + return PurePosixPath(Path(spec["location"]), relative_filepath) else: - assert False + # For other protocols (gcs, azure, etc.), treat like S3 + location = spec.get("location", "") + return PurePosixPath(location, relative_filepath) if location else PurePosixPath(relative_filepath) def _make_uuid_path(self, uuid, suffix=""): """create external path based on the uuid hash""" return self._make_external_filepath( PurePosixPath( self.database, - "/".join(subfold(uuid.hex, self.spec["subfolding"])), + "/".join(subfold(uuid.hex, self.storage.spec["subfolding"])), uuid.hex, ).with_suffix(suffix) ) def _upload_file(self, local_path, external_path, metadata=None): - if self.spec["protocol"] == "s3": - self.s3.fput(local_path, external_path, metadata) - elif self.spec["protocol"] == "file": - safe_copy(local_path, external_path, overwrite=True) - else: - assert False + """Upload a file to external storage using fsspec backend.""" + self.storage.put_file(local_path, external_path, metadata) def _download_file(self, external_path, download_path): - if self.spec["protocol"] == "s3": - self.s3.fget(external_path, download_path) - elif self.spec["protocol"] == "file": - safe_copy(external_path, download_path) - else: - assert False + """Download a file from external storage using fsspec backend.""" + self.storage.get_file(external_path, download_path) def _upload_buffer(self, buffer, external_path): - if self.spec["protocol"] == "s3": - self.s3.put(external_path, buffer) - elif self.spec["protocol"] == "file": - safe_write(external_path, buffer) - else: - assert False + """Upload bytes to external storage using fsspec backend.""" + self.storage.put_buffer(buffer, external_path) def _download_buffer(self, external_path): - if self.spec["protocol"] == "s3": - return self.s3.get(external_path) - if self.spec["protocol"] == "file": - try: - return Path(external_path).read_bytes() - except FileNotFoundError: - raise errors.MissingExternalFile(f"Missing external file {external_path}") from None - assert False + """Download bytes from external storage using fsspec backend.""" + return self.storage.get_buffer(external_path) def _remove_external_file(self, external_path): - if self.spec["protocol"] == "s3": - self.s3.remove_object(external_path) - elif self.spec["protocol"] == "file": - try: - Path(external_path).unlink() - except FileNotFoundError: - pass + """Remove a file from external storage using fsspec backend.""" + self.storage.remove(external_path) def exists(self, external_filepath): """ + Check if an external file is accessible using fsspec backend. + :return: True if the external file is accessible """ - if self.spec["protocol"] == "s3": - return self.s3.exists(external_filepath) - if self.spec["protocol"] == "file": - return Path(external_filepath).is_file() - assert False + return self.storage.exists(external_filepath) # --- BLOBS ---- @@ -250,9 +235,9 @@ def upload_filepath(self, local_filepath): """ local_filepath = Path(local_filepath) try: - relative_filepath = str(local_filepath.relative_to(self.spec["stage"]).as_posix()) + relative_filepath = str(local_filepath.relative_to(self.storage.spec["stage"]).as_posix()) except ValueError: - raise DataJointError("The path {path} is not in stage {stage}".format(path=local_filepath.parent, **self.spec)) + raise DataJointError(f"The path {local_filepath.parent} is not in stage {self.storage.spec['stage']}") uuid = uuid_from_buffer(init_string=relative_filepath) # hash relative path, not contents contents_hash = uuid_from_file(local_filepath) @@ -300,7 +285,7 @@ def _need_checksum(local_filepath, expected_size): "filepath", "contents_hash", "size" ) external_path = self._make_external_filepath(relative_filepath) - local_filepath = Path(self.spec["stage"]).absolute() / relative_filepath + local_filepath = Path(self.storage.spec["stage"]).absolute() / relative_filepath file_exists = Path(local_filepath).is_file() and ( not _need_checksum(local_filepath, size) or uuid_from_file(local_filepath) == contents_hash diff --git a/src/datajoint/fetch.py b/src/datajoint/fetch.py index 5d02b52b0..0029a898f 100644 --- a/src/datajoint/fetch.py +++ b/src/datajoint/fetch.py @@ -10,9 +10,11 @@ from datajoint.condition import Top -from . import blob, hash +from . import hash from .errors import DataJointError +from .objectref import ObjectRef from .settings import config +from .storage import StorageBackend from .utils import safe_write @@ -48,13 +50,23 @@ def _get(connection, attr, data, squeeze, download_path): """ if data is None: return + if attr.is_object: + # Object type - return ObjectRef handle + json_data = json.loads(data) if isinstance(data, str) else data + try: + spec = config.get_object_storage_spec() + backend = StorageBackend(spec) + except DataJointError: + backend = None + return ObjectRef.from_json(json_data, backend=backend) if attr.json: return json.loads(data) extern = connection.schemas[attr.database].external[attr.store] if attr.is_external else None - # apply attribute adapter if present - adapt = attr.adapter.get if attr.adapter else lambda x: x + # apply custom attribute type decoder if present + def adapt(x): + return attr.adapter.decode(x, key=None) if attr.adapter else x if attr.is_filepath: return adapt(extern.download_filepath(uuid.UUID(bytes=data))[0]) @@ -87,18 +99,17 @@ def _get(connection, attr, data, squeeze, download_path): safe_write(local_filepath, data.split(b"\0", 1)[1]) return adapt(str(local_filepath)) # download file from remote store - return adapt( - uuid.UUID(bytes=data) - if attr.uuid - else ( - blob.unpack( - extern.get(uuid.UUID(bytes=data)) if attr.is_external else data, - squeeze=squeeze, - ) - if attr.is_blob - else data - ) - ) + if attr.uuid: + return adapt(uuid.UUID(bytes=data)) + elif attr.is_blob: + blob_data = extern.get(uuid.UUID(bytes=data)) if attr.is_external else data + # Adapters (like ) handle deserialization in decode() + # Without adapter, blob columns return raw bytes (no deserialization) + if attr.adapter: + return attr.adapter.decode(blob_data, key=None) + return blob_data # raw bytes + else: + return adapt(data) class Fetch: diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index 45e35998c..5cd5c44eb 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -5,7 +5,8 @@ import numpy as np -from .attribute_adapter import AttributeAdapter, get_adapter +from .attribute_adapter import get_adapter +from .attribute_type import AttributeType from .declare import ( EXTERNAL_TYPES, NATIVE_TYPES, @@ -15,6 +16,37 @@ ) from .errors import FILEPATH_FEATURE_SWITCH, DataJointError, _support_filepath_types + +class _MissingType(AttributeType): + """Placeholder for missing/unregistered attribute types. Raises error on use.""" + + def __init__(self, name: str): + self._name = name + + @property + def type_name(self) -> str: + return self._name + + @property + def dtype(self) -> str: + raise DataJointError( + f"Attribute type <{self._name}> is not registered. " + "Register it with @dj.register_type or include it in the schema context." + ) + + def encode(self, value, *, key=None): + raise DataJointError( + f"Attribute type <{self._name}> is not registered. " + "Register it with @dj.register_type or include it in the schema context." + ) + + def decode(self, stored, *, key=None): + raise DataJointError( + f"Attribute type <{self._name}> is not registered. " + "Register it with @dj.register_type or include it in the schema context." + ) + + logger = logging.getLogger(__name__.split(".")[0]) default_attribute_properties = dict( # these default values are set in computed attributes @@ -32,6 +64,7 @@ is_blob=False, is_attachment=False, is_filepath=False, + is_object=False, is_external=False, is_hidden=False, adapter=None, @@ -136,7 +169,11 @@ def blobs(self): @property def non_blobs(self): - return [k for k, v in self.attributes.items() if not (v.is_blob or v.is_attachment or v.is_filepath or v.json)] + return [ + k + for k, v in self.attributes.items() + if not (v.is_blob or v.is_attachment or v.is_filepath or v.is_object or v.json) + ] @property def new_attributes(self): @@ -262,6 +299,7 @@ def _init_from_database(self): json=bool(TYPE_PATTERN["JSON"].match(attr["type"])), is_attachment=False, is_filepath=False, + is_object=False, adapter=None, store=None, is_external=False, @@ -279,7 +317,7 @@ def _init_from_database(self): if special: special = special.groupdict() attr.update(special) - # process adapted attribute types + # process custom attribute types (adapted types) if special and TYPE_PATTERN["ADAPTED"].match(attr["type"]): assert context is not None, "Declaration context is not set" adapter_name = special["type"] @@ -287,15 +325,11 @@ def _init_from_database(self): attr.update(adapter=get_adapter(context, adapter_name)) except DataJointError: # if no adapter, then delay the error until the first invocation - attr.update(adapter=AttributeAdapter()) + attr.update(adapter=_MissingType(adapter_name)) else: - attr.update(type=attr["adapter"].attribute_type) + attr.update(type=attr["adapter"].dtype) if not any(r.match(attr["type"]) for r in TYPE_PATTERN.values()): - raise DataJointError( - "Invalid attribute type '{type}' in adapter object <{adapter_name}>.".format( - adapter_name=adapter_name, **attr - ) - ) + raise DataJointError(f"Invalid dtype '{attr['type']}' in attribute type <{adapter_name}>.") special = not any(TYPE_PATTERN[c].match(attr["type"]) for c in NATIVE_TYPES) if special: @@ -325,6 +359,7 @@ def _init_from_database(self): unsupported=False, is_attachment=category in ("INTERNAL_ATTACH", "EXTERNAL_ATTACH"), is_filepath=category == "FILEPATH", + is_object=category == "OBJECT", # INTERNAL_BLOB is not a custom type but is included for completeness is_blob=category in ("INTERNAL_BLOB", "EXTERNAL_BLOB"), uuid=category == "UUID", @@ -337,10 +372,13 @@ def _init_from_database(self): attr["is_blob"], attr["is_attachment"], attr["is_filepath"], + attr["is_object"], attr["json"], ) ): - raise DataJointError("Json, Blob, attachment, or filepath attributes are not allowed in the primary key") + raise DataJointError( + "Json, Blob, attachment, filepath, or object attributes " "are not allowed in the primary key" + ) if attr["string"] and attr["default"] is not None and attr["default"] not in sql_literals: attr["default"] = '"%s"' % attr["default"] diff --git a/src/datajoint/jobs.py b/src/datajoint/jobs.py index ff6440495..7dff66333 100644 --- a/src/datajoint/jobs.py +++ b/src/datajoint/jobs.py @@ -1,154 +1,502 @@ +""" +Autopopulate 2.0 Jobs System + +This module implements per-table job tables for auto-populated tables. +Each dj.Imported or dj.Computed table gets its own hidden jobs table +with FK-derived primary keys and rich status tracking. +""" + +import logging import os import platform +from datetime import datetime +from typing import TYPE_CHECKING -from .errors import DuplicateError -from .hash import key_hash +from .errors import DataJointError, DuplicateError +from .expression import QueryExpression from .heading import Heading from .settings import config from .table import Table +if TYPE_CHECKING: + from .autopopulate import AutoPopulate + +logger = logging.getLogger(__name__.split(".")[0]) + ERROR_MESSAGE_LENGTH = 2047 TRUNCATION_APPENDIX = "...truncated" +# Default configuration values +DEFAULT_STALE_TIMEOUT = 3600 # 1 hour +DEFAULT_PRIORITY = 5 +DEFAULT_KEEP_COMPLETED = False -class JobTable(Table): + +class JobsTable(Table): """ - A base table with no definition. Allows reserving jobs + Per-table job queue for auto-populated tables. + + Each dj.Imported or dj.Computed table has an associated hidden jobs table + with the naming convention ~__jobs. + + The jobs table primary key includes only those attributes derived from + foreign keys in the target table's primary key. Additional primary key + attributes (if any) are excluded. + + Status values: + - pending: Job is queued and ready to be processed + - reserved: Job is currently being processed by a worker + - success: Job completed successfully + - error: Job failed with an error + - ignore: Job should be skipped (manually set) """ - def __init__(self, conn, database): - self.database = database - self._connection = conn - self._heading = Heading(table_info=dict(conn=conn, database=database, table_name=self.table_name, context=None)) + def __init__(self, target: "AutoPopulate"): + """ + Initialize a JobsTable for the given auto-populated table. + + Args: + target: The auto-populated table (dj.Imported or dj.Computed) + """ + self._target = target + self._connection = target.connection + self.database = target.database + self._user = self.connection.get_user() + + # Derive the jobs table name from the target table + # e.g., __filtered_image -> _filtered_image__jobs + target_table_name = target.table_name + if target_table_name.startswith("__"): + # Computed table: __foo -> _foo__jobs + self._table_name = f"~{target_table_name[2:]}__jobs" + elif target_table_name.startswith("_"): + # Imported table: _foo -> _foo__jobs + self._table_name = f"~{target_table_name[1:]}__jobs" + else: + # Manual/Lookup (shouldn't happen for auto-populated) + self._table_name = f"~{target_table_name}__jobs" + + # Build the definition dynamically based on target's FK-derived primary key + self._definition = self._build_definition() + + # Initialize heading + self._heading = Heading( + table_info=dict( + conn=self._connection, + database=self.database, + table_name=self.table_name, + context=None, + ) + ) self._support = [self.full_table_name] - self._definition = """ # job reservation table for `{database}` - table_name :varchar(255) # className of the table - key_hash :char(32) # key hash - --- - status :enum('reserved','error','ignore') # if tuple is missing, the job is available - key=null :blob # structure containing the key - error_message="" :varchar({error_message_length}) # error message returned if failed - error_stack=null :mediumblob # error stack if failed - user="" :varchar(255) # database user - host="" :varchar(255) # system hostname - pid=0 :int unsigned # system process id - connection_id = 0 : bigint unsigned # connection_id() - timestamp=CURRENT_TIMESTAMP :timestamp # automatic timestamp - """.format(database=database, error_message_length=ERROR_MESSAGE_LENGTH) + def _get_fk_derived_primary_key(self) -> list[tuple[str, str]]: + """ + Get the FK-derived primary key attributes from the target table. + + Returns: + List of (attribute_name, attribute_type) tuples for FK-derived PK attributes. + """ + # Get parent tables that contribute to the primary key + parents = self._target.parents(primary=True, as_objects=True, foreign_key_info=True) + + # Collect all FK-derived primary key attributes + fk_pk_attrs = set() + for parent_table, props in parents: + # attr_map maps child attr -> parent attr + for child_attr in props["attr_map"].keys(): + fk_pk_attrs.add(child_attr) + + # Get attribute definitions from target table's heading + pk_definitions = [] + for attr_name in self._target.primary_key: + if attr_name in fk_pk_attrs: + attr = self._target.heading.attributes[attr_name] + # Build attribute definition string + attr_def = f"{attr_name} : {attr.type}" + pk_definitions.append((attr_name, attr_def)) + + return pk_definitions + + def _build_definition(self) -> str: + """ + Build the table definition for the jobs table. + + Returns: + DataJoint table definition string. + """ + # Get FK-derived primary key attributes + pk_attrs = self._get_fk_derived_primary_key() + + if not pk_attrs: + raise DataJointError( + f"Cannot create jobs table for {self._target.full_table_name}: " + "no foreign-key-derived primary key attributes found." + ) + + # Build primary key section + pk_section = "\n".join(attr_def for _, attr_def in pk_attrs) + + definition = f"""# Job queue for {self._target.class_name} +{pk_section} +--- +status : enum('pending', 'reserved', 'success', 'error', 'ignore') +priority : int # Lower = more urgent (0 = highest priority) +created_time : datetime(6) # When job was added to queue +scheduled_time : datetime(6) # Process on or after this time +reserved_time=null : datetime(6) # When job was reserved +completed_time=null : datetime(6) # When job completed +duration=null : float # Execution duration in seconds +error_message="" : varchar({ERROR_MESSAGE_LENGTH}) # Error message if failed +error_stack=null : # Full error traceback +user="" : varchar(255) # Database user who reserved/completed job +host="" : varchar(255) # Hostname of worker +pid=0 : int unsigned # Process ID of worker +connection_id=0 : bigint unsigned # MySQL connection ID +version="" : varchar(255) # Code version +""" + return definition + + @property + def definition(self) -> str: + return self._definition + + @property + def table_name(self) -> str: + return self._table_name + + @property + def target(self) -> "AutoPopulate": + """The auto-populated table this jobs table is associated with.""" + return self._target + + def _ensure_declared(self) -> None: + """Ensure the jobs table is declared in the database.""" if not self.is_declared: self.declare() - self._user = self.connection.get_user() + + # --- Status filter properties --- @property - def definition(self): - return self._definition + def pending(self) -> QueryExpression: + """Return query for pending jobs.""" + self._ensure_declared() + return self & 'status="pending"' + + @property + def reserved(self) -> QueryExpression: + """Return query for reserved jobs.""" + self._ensure_declared() + return self & 'status="reserved"' + + @property + def errors(self) -> QueryExpression: + """Return query for error jobs.""" + self._ensure_declared() + return self & 'status="error"' + + @property + def ignored(self) -> QueryExpression: + """Return query for ignored jobs.""" + self._ensure_declared() + return self & 'status="ignore"' @property - def table_name(self): - return "~jobs" + def completed(self) -> QueryExpression: + """Return query for completed (success) jobs.""" + self._ensure_declared() + return self & 'status="success"' + + # --- Core methods --- - def delete(self): - """bypass interactive prompts and dependencies""" + def delete(self) -> None: + """Delete jobs without confirmation (inherits from delete_quick).""" self.delete_quick() - def drop(self): - """bypass interactive prompts and dependencies""" + def drop(self) -> None: + """Drop the jobs table without confirmation.""" self.drop_quick() - def reserve(self, table_name, key): + def refresh( + self, + *restrictions, + delay: float = 0, + priority: int = None, + stale_timeout: float = None, + ) -> dict: """ - Reserve a job for computation. When a job is reserved, the job table contains an entry for the - job key, identified by its hash. When jobs are completed, the entry is removed. + Refresh the jobs queue: add new jobs and remove stale ones. - :param table_name: `database`.`table_name` - :param key: the dict of the job's primary key - :return: True if reserved job successfully. False = the jobs is already taken + Operations performed: + 1. Add new jobs: (key_source & restrictions) - target - jobs → insert as 'pending' + 2. Remove stale jobs: pending jobs older than stale_timeout whose keys + are no longer in key_source + + Args: + restrictions: Conditions to filter key_source + delay: Seconds from now until jobs become available for processing. + Default: 0 (jobs are immediately available). + Uses database server time to avoid clock sync issues. + priority: Priority for new jobs (lower = more urgent). Default from config. + stale_timeout: Seconds after which pending jobs are checked for staleness. + Default from config. + + Returns: + {'added': int, 'removed': int} - counts of jobs added and stale jobs removed """ - job = dict( - table_name=table_name, - key_hash=key_hash(key), - status="reserved", - host=platform.node(), - pid=os.getpid(), - connection_id=self.connection.connection_id, - key=key, - user=self._user, - ) - try: - with config.override(enable_python_native_blobs=True): - self.insert1(job, ignore_extra_fields=True) - except DuplicateError: - return False - return True + self._ensure_declared() + + if priority is None: + priority = config.jobs.default_priority + if stale_timeout is None: + stale_timeout = config.jobs.stale_timeout + + # Get FK-derived primary key attribute names + pk_attrs = [name for name, _ in self._get_fk_derived_primary_key()] - def ignore(self, table_name, key): + # Step 1: Find new keys to add + # (key_source & restrictions) - target - jobs + key_source = self._target.key_source + if restrictions: + from .expression import AndList + + key_source = key_source & AndList(restrictions) + + # Project to FK-derived attributes only + key_source_proj = key_source.proj(*pk_attrs) + target_proj = self._target.proj(*pk_attrs) + existing_jobs = self.proj() # jobs table PK is the FK-derived attrs + + # Keys that need jobs: in key_source, not in target, not already in jobs + new_keys = (key_source_proj - target_proj - existing_jobs).fetch("KEY") + + # Insert new jobs + added = 0 + for key in new_keys: + try: + self._insert_job_with_delay(key, priority, delay) + added += 1 + except DuplicateError: + # Job was added by another process + pass + + # Step 2: Remove stale pending jobs + # Find pending jobs older than stale_timeout whose keys are not in key_source + removed = 0 + if stale_timeout > 0: + stale_condition = f'status="pending" AND ' f"created_time < NOW() - INTERVAL {stale_timeout} SECOND" + stale_jobs = (self & stale_condition).proj() + + # Check which stale jobs are no longer in key_source + orphaned_keys = (stale_jobs - key_source_proj).fetch("KEY") + for key in orphaned_keys: + (self & key).delete_quick() + removed += 1 + + return {"added": added, "removed": removed} + + def _insert_job_with_delay(self, key: dict, priority: int, delay: float) -> None: """ - Set a job to be ignored for computation. When a job is ignored, the job table contains an entry for the - job key, identified by its hash, with status "ignore". + Insert a new job with scheduled_time set using database server time. Args: - table_name: - Table name (str) - `database`.`table_name` - key: - The dict of the job's primary key + key: Primary key dict for the job + priority: Job priority (lower = more urgent) + delay: Seconds from now until job becomes available + """ + # Build column names and values + pk_attrs = [name for name, _ in self._get_fk_derived_primary_key()] + columns = pk_attrs + ["status", "priority", "created_time", "scheduled_time", "user", "host", "pid", "connection_id"] - Returns: - True if ignore job successfully. False = the jobs is already taken - """ - job = dict( - table_name=table_name, - key_hash=key_hash(key), - status="ignore", - host=platform.node(), - pid=os.getpid(), - connection_id=self.connection.connection_id, - key=key, - user=self._user, - ) - try: - with config.override(enable_python_native_blobs=True): - self.insert1(job, ignore_extra_fields=True) - except DuplicateError: - return False - return True + # Build values + pk_values = [f"'{key[attr]}'" if isinstance(key[attr], str) else str(key[attr]) for attr in pk_attrs] + other_values = [ + "'pending'", + str(priority), + "NOW(6)", # created_time + f"NOW(6) + INTERVAL {delay} SECOND" if delay > 0 else "NOW(6)", # scheduled_time + f"'{self._user}'", + f"'{platform.node()}'", + str(os.getpid()), + str(self.connection.connection_id), + ] - def complete(self, table_name, key): + sql = f""" + INSERT INTO {self.full_table_name} + ({', '.join(f'`{c}`' for c in columns)}) + VALUES ({', '.join(pk_values + other_values)}) """ - Log a completed job. When a job is completed, its reservation entry is deleted. + self.connection.query(sql) - :param table_name: `database`.`table_name` - :param key: the dict of the job's primary key + def reserve(self, key: dict) -> None: """ - job_key = dict(table_name=table_name, key_hash=key_hash(key)) - (self & job_key).delete_quick() + Reserve a job for processing. + + Updates the job record to 'reserved' status. The caller (populate) is + responsible for verifying the job is pending before calling this method. - def error(self, table_name, key, error_message, error_stack=None): + Args: + key: Primary key dict for the job """ - Log an error message. The job reservation is replaced with an error entry. - if an error occurs, leave an entry describing the problem + self._ensure_declared() + + pk_attrs = [name for name, _ in self._get_fk_derived_primary_key()] + job_key = {attr: key[attr] for attr in pk_attrs if attr in key} - :param table_name: `database`.`table_name` - :param key: the dict of the job's primary key - :param error_message: string error message - :param error_stack: stack trace + update_row = { + **job_key, + "status": "reserved", + "reserved_time": datetime.now(), + "user": self._user, + "host": platform.node(), + "pid": os.getpid(), + "connection_id": self.connection.connection_id, + } + self.update1(update_row) + + def complete(self, key: dict, duration: float = None, keep: bool = None) -> None: + """ + Mark a job as successfully completed. + + Args: + key: Primary key dict for the job + duration: Execution duration in seconds + keep: If True, mark as 'success'. If False, delete the job entry. + Default from config (jobs.keep_completed). """ + self._ensure_declared() + + if keep is None: + keep = config.jobs.keep_completed + + pk_attrs = [name for name, _ in self._get_fk_derived_primary_key()] + job_key = {attr: key[attr] for attr in pk_attrs if attr in key} + + if keep: + # Update to success status + update_row = { + **job_key, + "status": "success", + "completed_time": datetime.now(), + } + if duration is not None: + update_row["duration"] = duration + self.update1(update_row) + else: + # Delete the job entry + (self & job_key).delete_quick() + + def error(self, key: dict, error_message: str, error_stack: str = None) -> None: + """ + Mark a job as failed with error details. + + Args: + key: Primary key dict for the job + error_message: Error message string + error_stack: Full stack trace + """ + self._ensure_declared() + + # Truncate error message if necessary if len(error_message) > ERROR_MESSAGE_LENGTH: error_message = error_message[: ERROR_MESSAGE_LENGTH - len(TRUNCATION_APPENDIX)] + TRUNCATION_APPENDIX - with config.override(enable_python_native_blobs=True): - self.insert1( - dict( - table_name=table_name, - key_hash=key_hash(key), - status="error", - host=platform.node(), - pid=os.getpid(), - connection_id=self.connection.connection_id, - user=self._user, - key=key, - error_message=error_message, - error_stack=error_stack, - ), - replace=True, - ignore_extra_fields=True, - ) + + pk_attrs = [name for name, _ in self._get_fk_derived_primary_key()] + job_key = {attr: key[attr] for attr in pk_attrs if attr in key} + + # Build update dict with all required fields + update_row = { + **job_key, + "status": "error", + "completed_time": datetime.now(), + "error_message": error_message, + } + if error_stack is not None: + update_row["error_stack"] = error_stack + + self.update1(update_row) + + def ignore(self, key: dict) -> None: + """ + Mark a key to be ignored (skipped during populate). + + Only inserts new records. Existing job entries cannot be converted to + ignore status - they must be cleared first. + + Args: + key: Primary key dict for the job + """ + self._ensure_declared() + + pk_attrs = [name for name, _ in self._get_fk_derived_primary_key()] + job_key = {attr: key[attr] for attr in pk_attrs if attr in key} + + try: + self._insert_job_with_status(job_key, "ignore") + except DuplicateError: + pass # Already tracked + + def _insert_job_with_status(self, key: dict, status: str) -> None: + """Insert a new job with the given status.""" + now = datetime.now() + row = { + **key, + "status": status, + "priority": DEFAULT_PRIORITY, + "created_time": now, + "scheduled_time": now, + "user": self._user, + "host": platform.node(), + "pid": os.getpid(), + "connection_id": self.connection.connection_id, + } + self.insert1(row) + + def progress(self) -> dict: + """ + Report detailed progress of job processing. + + Returns: + Dict with counts for each status and total. + """ + self._ensure_declared() + + result = { + "pending": len(self.pending), + "reserved": len(self.reserved), + "success": len(self.completed), + "error": len(self.errors), + "ignore": len(self.ignored), + } + result["total"] = sum(result.values()) + return result + + def fetch_pending( + self, + limit: int = None, + priority: int = None, + ) -> list[dict]: + """ + Fetch pending jobs ordered by priority and scheduled time. + + Args: + limit: Maximum number of jobs to fetch + priority: Only fetch jobs at this priority or more urgent (lower values) + + Returns: + List of job key dicts + """ + self._ensure_declared() + + # Build query for non-stale pending jobs + query = self & 'status="pending" AND scheduled_time <= NOW(6)' + + if priority is not None: + query = query & f"priority <= {priority}" + + # Fetch with ordering + return query.fetch( + "KEY", + order_by=["priority ASC", "scheduled_time ASC"], + limit=limit, + ) diff --git a/src/datajoint/migrate.py b/src/datajoint/migrate.py new file mode 100644 index 000000000..696ca380e --- /dev/null +++ b/src/datajoint/migrate.py @@ -0,0 +1,250 @@ +""" +Migration utilities for DataJoint schema updates. + +This module provides tools for migrating existing schemas to use the new +AttributeType system, particularly for upgrading blob columns to use +explicit `` type declarations. +""" + +from __future__ import annotations + +import logging +import re +from typing import TYPE_CHECKING + +from .errors import DataJointError + +if TYPE_CHECKING: + from .schemas import Schema + +logger = logging.getLogger(__name__.split(".")[0]) + +# Pattern to detect blob types +BLOB_TYPES = re.compile(r"^(tiny|small|medium|long|)blob$", re.I) + + +def analyze_blob_columns(schema: Schema) -> list[dict]: + """ + Analyze a schema to find blob columns that could be migrated to . + + This function identifies blob columns that: + 1. Have a MySQL blob type (tinyblob, blob, mediumblob, longblob) + 2. Do NOT already have an adapter/type specified in their comment + + All blob size variants are included in the analysis. + + Args: + schema: The DataJoint schema to analyze. + + Returns: + List of dicts with keys: + - table_name: Full table name (database.table) + - column_name: Name of the blob column + - column_type: MySQL column type (tinyblob, blob, mediumblob, longblob) + - current_comment: Current column comment + - needs_migration: True if column should be migrated + + Example: + >>> import datajoint as dj + >>> schema = dj.schema('my_database') + >>> columns = dj.migrate.analyze_blob_columns(schema) + >>> for col in columns: + ... if col['needs_migration']: + ... print(f"{col['table_name']}.{col['column_name']} ({col['column_type']})") + """ + results = [] + + connection = schema.connection + + # Get all tables in the schema + tables_query = """ + SELECT TABLE_NAME + FROM information_schema.TABLES + WHERE TABLE_SCHEMA = %s + AND TABLE_TYPE = 'BASE TABLE' + AND TABLE_NAME NOT LIKE '~%%' + """ + + tables = connection.query(tables_query, args=(schema.database,)).fetchall() + + for (table_name,) in tables: + # Get column information for each table + columns_query = """ + SELECT COLUMN_NAME, COLUMN_TYPE, COLUMN_COMMENT + FROM information_schema.COLUMNS + WHERE TABLE_SCHEMA = %s + AND TABLE_NAME = %s + AND DATA_TYPE IN ('tinyblob', 'blob', 'mediumblob', 'longblob') + """ + + columns = connection.query(columns_query, args=(schema.database, table_name)).fetchall() + + for column_name, column_type, comment in columns: + # Check if comment already has an adapter type (starts with :type:) + has_adapter = comment and comment.startswith(":") + + results.append( + { + "table_name": f"{schema.database}.{table_name}", + "column_name": column_name, + "column_type": column_type, + "current_comment": comment or "", + "needs_migration": not has_adapter, + } + ) + + return results + + +def generate_migration_sql( + schema: Schema, + target_type: str = "djblob", + dry_run: bool = True, +) -> list[str]: + """ + Generate SQL statements to migrate blob columns to use . + + This generates ALTER TABLE statements that update column comments to + include the `::` prefix, marking them as using explicit + DataJoint blob serialization. + + Args: + schema: The DataJoint schema to migrate. + target_type: The type name to migrate to (default: "djblob"). + dry_run: If True, only return SQL without executing. + + Returns: + List of SQL ALTER TABLE statements. + + Example: + >>> sql_statements = dj.migrate.generate_migration_sql(schema) + >>> for sql in sql_statements: + ... print(sql) + + Note: + This is a metadata-only migration. The actual blob data format + remains unchanged - only the column comments are updated to + indicate explicit type handling. + """ + columns = analyze_blob_columns(schema) + sql_statements = [] + + for col in columns: + if not col["needs_migration"]: + continue + + # Build new comment with type prefix + old_comment = col["current_comment"] + new_comment = f":<{target_type}>:{old_comment}" + + # Escape special characters for SQL + new_comment_escaped = new_comment.replace("\\", "\\\\").replace("'", "\\'") + + # Parse table name + db_name, table_name = col["table_name"].split(".") + + # Generate ALTER TABLE statement + sql = ( + f"ALTER TABLE `{db_name}`.`{table_name}` " + f"MODIFY COLUMN `{col['column_name']}` {col['column_type']} " + f"COMMENT '{new_comment_escaped}'" + ) + sql_statements.append(sql) + + return sql_statements + + +def migrate_blob_columns( + schema: Schema, + target_type: str = "djblob", + dry_run: bool = True, +) -> dict: + """ + Migrate blob columns in a schema to use explicit type. + + This updates column comments in the database to include the type + declaration. The data format remains unchanged. + + Args: + schema: The DataJoint schema to migrate. + target_type: The type name to migrate to (default: "djblob"). + dry_run: If True, only preview changes without applying. + + Returns: + Dict with keys: + - analyzed: Number of blob columns analyzed + - needs_migration: Number of columns that need migration + - migrated: Number of columns migrated (0 if dry_run) + - sql_statements: List of SQL statements (executed or to be executed) + + Example: + >>> # Preview migration + >>> result = dj.migrate.migrate_blob_columns(schema, dry_run=True) + >>> print(f"Would migrate {result['needs_migration']} columns") + + >>> # Apply migration + >>> result = dj.migrate.migrate_blob_columns(schema, dry_run=False) + >>> print(f"Migrated {result['migrated']} columns") + + Warning: + After migration, table definitions should be updated to use + `` instead of `longblob` for consistency. The migration + only updates database metadata; source code changes are manual. + """ + columns = analyze_blob_columns(schema) + sql_statements = generate_migration_sql(schema, target_type=target_type) + + result = { + "analyzed": len(columns), + "needs_migration": sum(1 for c in columns if c["needs_migration"]), + "migrated": 0, + "sql_statements": sql_statements, + } + + if dry_run: + logger.info(f"Dry run: would migrate {result['needs_migration']} columns") + for sql in sql_statements: + logger.info(f" {sql}") + return result + + # Execute migrations + connection = schema.connection + for sql in sql_statements: + try: + connection.query(sql) + result["migrated"] += 1 + logger.info(f"Executed: {sql}") + except Exception as e: + logger.error(f"Failed to execute: {sql}\nError: {e}") + raise DataJointError(f"Migration failed: {e}") from e + + logger.info(f"Successfully migrated {result['migrated']} columns") + return result + + +def check_migration_status(schema: Schema) -> dict: + """ + Check the migration status of blob columns in a schema. + + Args: + schema: The DataJoint schema to check. + + Returns: + Dict with keys: + - total_blob_columns: Total number of blob columns + - migrated: Number of columns with explicit type + - pending: Number of columns using implicit serialization + - columns: List of column details + + Example: + >>> status = dj.migrate.check_migration_status(schema) + >>> print(f"Migration progress: {status['migrated']}/{status['total_blob_columns']}") + """ + columns = analyze_blob_columns(schema) + + return { + "total_blob_columns": len(columns), + "migrated": sum(1 for c in columns if not c["needs_migration"]), + "pending": sum(1 for c in columns if c["needs_migration"]), + "columns": columns, + } diff --git a/src/datajoint/objectref.py b/src/datajoint/objectref.py new file mode 100644 index 000000000..32f7b1669 --- /dev/null +++ b/src/datajoint/objectref.py @@ -0,0 +1,360 @@ +""" +ObjectRef class for handling fetched object type attributes. + +This module provides the ObjectRef class which represents a reference to a file +or folder stored in the pipeline's object storage backend. It provides metadata +access and direct fsspec-based file operations. +""" + +import json +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import IO, Iterator + +import fsspec + +from .errors import DataJointError +from .storage import StorageBackend + + +class IntegrityError(DataJointError): + """Raised when object integrity verification fails.""" + + pass + + +@dataclass +class ObjectRef: + """ + Handle to a file or folder stored in the pipeline's object storage backend. + + This class is returned when fetching object-type attributes. It provides + metadata access without I/O, and methods for reading content directly + from the storage backend. + + Attributes: + path: Full path/key within storage backend (includes token) + size: Total size in bytes (sum for folders), or None if not computed. + For large hierarchical data like Zarr stores, size computation can + be expensive and is optional. + hash: Content hash with algorithm prefix, or None if not computed + ext: File extension as tooling hint (e.g., ".dat", ".zarr") or None. + This is a conventional suffix for tooling, not a content-type declaration. + is_dir: True if stored content is a directory/key-prefix (e.g., Zarr store) + timestamp: ISO 8601 upload timestamp + mime_type: MIME type (files only, auto-detected from extension) + item_count: Number of files (folders only), or None if not computed + """ + + path: str + size: int | None + hash: str | None + ext: str | None + is_dir: bool + timestamp: datetime + mime_type: str | None = None + item_count: int | None = None + _backend: StorageBackend | None = None + + @classmethod + def from_json(cls, json_data: dict | str, backend: StorageBackend | None = None) -> "ObjectRef": + """ + Create an ObjectRef from JSON metadata stored in the database. + + Args: + json_data: JSON string or dict containing object metadata + backend: StorageBackend instance for file operations + + Returns: + ObjectRef instance + """ + if isinstance(json_data, str): + data = json.loads(json_data) + else: + data = json_data + + timestamp = data.get("timestamp") + if isinstance(timestamp, str): + timestamp = datetime.fromisoformat(timestamp.replace("Z", "+00:00")) + + return cls( + path=data["path"], + size=data["size"], + hash=data.get("hash"), + ext=data.get("ext"), + is_dir=data.get("is_dir", False), + timestamp=timestamp, + mime_type=data.get("mime_type"), + item_count=data.get("item_count"), + _backend=backend, + ) + + def to_json(self) -> dict: + """ + Convert ObjectRef to JSON-serializable dict for database storage. + + Returns: + Dict suitable for JSON serialization + """ + data = { + "path": self.path, + "size": self.size, + "hash": self.hash, + "ext": self.ext, + "is_dir": self.is_dir, + "timestamp": self.timestamp.isoformat() if self.timestamp else None, + } + if self.mime_type: + data["mime_type"] = self.mime_type + if self.item_count is not None: + data["item_count"] = self.item_count + return data + + def _ensure_backend(self): + """Ensure storage backend is available for I/O operations.""" + if self._backend is None: + raise DataJointError( + "ObjectRef has no storage backend configured. " + "This usually means the object was created without a connection context." + ) + + @property + def fs(self) -> fsspec.AbstractFileSystem: + """ + Return fsspec filesystem for direct access. + + This allows integration with libraries like Zarr and xarray that + work with fsspec filesystems. + """ + self._ensure_backend() + return self._backend.fs + + @property + def store(self) -> fsspec.FSMap: + """ + Return FSMap suitable for Zarr/xarray. + + This provides a dict-like interface to the storage location, + compatible with zarr.open() and xarray.open_zarr(). + """ + self._ensure_backend() + full_path = self._backend._full_path(self.path) + return fsspec.FSMap(full_path, self._backend.fs) + + @property + def full_path(self) -> str: + """ + Return full URI (e.g., 's3://bucket/path'). + + This is the complete path including protocol and bucket/location. + """ + self._ensure_backend() + protocol = self._backend.protocol + if protocol == "file": + return str(Path(self._backend.spec.get("location", "")) / self.path) + elif protocol == "s3": + bucket = self._backend.spec["bucket"] + return f"s3://{bucket}/{self.path}" + elif protocol == "gcs": + bucket = self._backend.spec["bucket"] + return f"gs://{bucket}/{self.path}" + elif protocol == "azure": + container = self._backend.spec["container"] + return f"az://{container}/{self.path}" + else: + return self.path + + def read(self) -> bytes: + """ + Read entire file content as bytes. + + Returns: + File contents as bytes + + Raises: + DataJointError: If object is a directory + """ + if self.is_dir: + raise DataJointError("Cannot read() a directory. Use listdir() or walk() instead.") + self._ensure_backend() + return self._backend.get_buffer(self.path) + + def open(self, subpath: str | None = None, mode: str = "rb") -> IO: + """ + Open file for reading. + + Args: + subpath: Optional path within directory (for folder objects) + mode: File mode ('rb' for binary read, 'r' for text) + + Returns: + File-like object + """ + self._ensure_backend() + path = self.path + if subpath: + if not self.is_dir: + raise DataJointError("Cannot use subpath on a file object") + path = f"{self.path}/{subpath}" + return self._backend.open(path, mode) + + def listdir(self, subpath: str = "") -> list[str]: + """ + List contents of directory. + + Args: + subpath: Optional subdirectory path + + Returns: + List of filenames/directory names + """ + if not self.is_dir: + raise DataJointError("Cannot listdir() on a file. Use read() or open() instead.") + self._ensure_backend() + path = f"{self.path}/{subpath}" if subpath else self.path + full_path = self._backend._full_path(path) + entries = self._backend.fs.ls(full_path, detail=False) + # Return just the basename of each entry + return [e.split("/")[-1] for e in entries] + + def walk(self) -> Iterator[tuple[str, list[str], list[str]]]: + """ + Walk directory tree, similar to os.walk(). + + Yields: + Tuples of (dirpath, dirnames, filenames) + """ + if not self.is_dir: + raise DataJointError("Cannot walk() on a file.") + self._ensure_backend() + full_path = self._backend._full_path(self.path) + for root, dirs, files in self._backend.fs.walk(full_path): + # Make paths relative to the object root + rel_root = root[len(full_path) :].lstrip("/") + yield rel_root, dirs, files + + def download(self, destination: Path | str, subpath: str | None = None) -> Path: + """ + Download object to local filesystem. + + Args: + destination: Local directory or file path + subpath: Optional path within directory (for folder objects) + + Returns: + Path to downloaded file/directory + """ + self._ensure_backend() + destination = Path(destination) + + if subpath: + if not self.is_dir: + raise DataJointError("Cannot use subpath on a file object") + remote_path = f"{self.path}/{subpath}" + else: + remote_path = self.path + + if self.is_dir and not subpath: + # Download entire directory + destination.mkdir(parents=True, exist_ok=True) + full_path = self._backend._full_path(remote_path) + self._backend.fs.get(full_path, str(destination), recursive=True) + else: + # Download single file + if destination.is_dir(): + filename = remote_path.split("/")[-1] + destination = destination / filename + destination.parent.mkdir(parents=True, exist_ok=True) + self._backend.get_file(remote_path, destination) + + return destination + + def exists(self, subpath: str | None = None) -> bool: + """ + Check if object (or subpath within it) exists. + + Args: + subpath: Optional path within directory + + Returns: + True if exists + """ + self._ensure_backend() + path = f"{self.path}/{subpath}" if subpath else self.path + return self._backend.exists(path) + + def verify(self) -> bool: + """ + Verify object integrity. + + For files: checks size matches, and hash if available. + For folders: validates manifest (all files exist with correct sizes). + + Returns: + True if valid + + Raises: + IntegrityError: If verification fails with details + """ + self._ensure_backend() + + if self.is_dir: + return self._verify_folder() + else: + return self._verify_file() + + def _verify_file(self) -> bool: + """Verify a single file.""" + # Check existence + if not self._backend.exists(self.path): + raise IntegrityError(f"File does not exist: {self.path}") + + # Check size if available + if self.size is not None: + actual_size = self._backend.size(self.path) + if actual_size != self.size: + raise IntegrityError(f"Size mismatch for {self.path}: expected {self.size}, got {actual_size}") + + # Check hash if available + if self.hash: + # TODO: Implement hash verification + pass + + return True + + def _verify_folder(self) -> bool: + """Verify a folder using its manifest.""" + manifest_path = f"{self.path}.manifest.json" + + if not self._backend.exists(manifest_path): + raise IntegrityError(f"Manifest file missing: {manifest_path}") + + # Read manifest + manifest_data = self._backend.get_buffer(manifest_path) + manifest = json.loads(manifest_data) + + # Verify each file in manifest + errors = [] + for file_info in manifest.get("files", []): + file_path = f"{self.path}/{file_info['path']}" + expected_size = file_info["size"] + + if not self._backend.exists(file_path): + errors.append(f"Missing file: {file_info['path']}") + else: + actual_size = self._backend.size(file_path) + if actual_size != expected_size: + errors.append(f"Size mismatch for {file_info['path']}: expected {expected_size}, got {actual_size}") + + if errors: + raise IntegrityError("Folder verification failed:\n" + "\n".join(errors)) + + return True + + def __repr__(self) -> str: + type_str = "folder" if self.is_dir else "file" + return f"ObjectRef({type_str}: {self.path}, size={self.size})" + + def __str__(self) -> str: + return self.path diff --git a/src/datajoint/s3.py b/src/datajoint/s3.py index e107a7f4b..2e2ea151a 100644 --- a/src/datajoint/s3.py +++ b/src/datajoint/s3.py @@ -1,9 +1,19 @@ """ -AWS S3 operations +AWS S3 operations using minio client. + +.. deprecated:: 0.15.0 + This module is deprecated. Use :mod:`datajoint.storage` with fsspec backend instead. + The minio-based S3 client will be removed in a future version. + + Migration guide: + - Instead of importing from datajoint.s3, use datajoint.storage.StorageBackend + - StorageBackend provides a unified interface for all storage protocols + - See datajoint.storage module for details """ import logging import uuid +import warnings from io import BytesIO from pathlib import Path @@ -17,7 +27,10 @@ class Folder: """ - A Folder instance manipulates a flat folder of objects within an S3-compatible object store + A Folder instance manipulates a flat folder of objects within an S3-compatible object store. + + .. deprecated:: 0.15.0 + Use :class:`datajoint.storage.StorageBackend` instead. """ def __init__( @@ -31,6 +44,12 @@ def __init__( proxy_server=None, **_, ): + warnings.warn( + "datajoint.s3.Folder is deprecated and will be removed in a future version. " + "Use datajoint.storage.StorageBackend with fsspec instead.", + DeprecationWarning, + stacklevel=2, + ) # from https://docs.min.io/docs/python-client-api-reference self.client = minio.Minio( endpoint, diff --git a/src/datajoint/schemas.py b/src/datajoint/schemas.py index e9b83efff..9df3ba34d 100644 --- a/src/datajoint/schemas.py +++ b/src/datajoint/schemas.py @@ -10,7 +10,6 @@ from .errors import AccessError, DataJointError from .external import ExternalMapping from .heading import Heading -from .jobs import JobTable from .settings import config from .table import FreeTable, Log, lookup_class_name from .user_tables import Computed, Imported, Lookup, Manual, Part, _get_tier @@ -70,7 +69,7 @@ def __init__( self.context = context self.create_schema = create_schema self.create_tables = create_tables - self._jobs = None + self._auto_populated_tables = [] # Track auto-populated table classes self.external = ExternalMapping(self) self.add_objects = add_objects self.declare_list = [] @@ -227,6 +226,11 @@ def _decorate_table(self, table_class, context, assert_declared=False): else: instance.insert(contents, skip_duplicates=True) + # Track auto-populated tables for schema.jobs + if isinstance(instance, (Imported, Computed)) and not isinstance(instance, Part): + if table_class not in self._auto_populated_tables: + self._auto_populated_tables.append(table_class) + @property def log(self): self._assert_exists() @@ -338,14 +342,15 @@ def exists(self): @property def jobs(self): """ - schema.jobs provides a view of the job reservation table for the schema + Access job tables for all auto-populated tables in the schema. + + Returns a list of JobsTable objects, one for each Imported or Computed + table in the schema. - :return: jobs table + :return: list of JobsTable objects """ self._assert_exists() - if self._jobs is None: - self._jobs = JobTable(self.connection, self.database) - return self._jobs + return [table_class().jobs for table_class in self._auto_populated_tables] @property def code(self): diff --git a/src/datajoint/settings.py b/src/datajoint/settings.py index 65b91aa2c..322aca099 100644 --- a/src/datajoint/settings.py +++ b/src/datajoint/settings.py @@ -188,6 +188,50 @@ class ExternalSettings(BaseSettings): aws_secret_access_key: SecretStr | None = Field(default=None, validation_alias="DJ_AWS_SECRET_ACCESS_KEY") +class JobsSettings(BaseSettings): + """Job queue settings for auto-populated tables.""" + + model_config = SettingsConfigDict( + env_prefix="DJ_JOBS_", + case_sensitive=False, + extra="forbid", + validate_assignment=True, + ) + + auto_refresh: bool = Field(default=True, description="Auto-refresh on populate") + keep_completed: bool = Field(default=False, description="Keep success records in jobs table") + stale_timeout: int = Field(default=3600, description="Seconds before pending job is considered stale") + default_priority: int = Field(default=5, description="Default priority for new jobs (lower = more urgent)") + + +class ObjectStorageSettings(BaseSettings): + """Object storage configuration for the object type.""" + + model_config = SettingsConfigDict( + env_prefix="DJ_OBJECT_STORAGE_", + case_sensitive=False, + extra="forbid", + validate_assignment=True, + ) + + # Required settings + project_name: str | None = Field(default=None, description="Unique project identifier") + protocol: str | None = Field(default=None, description="Storage protocol: file, s3, gcs, azure") + location: str | None = Field(default=None, description="Base path or bucket prefix") + + # Cloud storage settings + bucket: str | None = Field(default=None, description="Bucket name (S3, GCS)") + container: str | None = Field(default=None, description="Container name (Azure)") + endpoint: str | None = Field(default=None, description="S3 endpoint URL") + access_key: str | None = Field(default=None, description="Access key") + secret_key: SecretStr | None = Field(default=None, description="Secret key") + secure: bool = Field(default=True, description="Use HTTPS") + + # Optional settings + partition_pattern: str | None = Field(default=None, description="Path pattern with {attribute} placeholders") + token_length: int = Field(default=8, ge=4, le=16, description="Random suffix length for filenames") + + class Config(BaseSettings): """ Main DataJoint configuration. @@ -219,6 +263,8 @@ class Config(BaseSettings): connection: ConnectionSettings = Field(default_factory=ConnectionSettings) display: DisplaySettings = Field(default_factory=DisplaySettings) external: ExternalSettings = Field(default_factory=ExternalSettings) + jobs: JobsSettings = Field(default_factory=JobsSettings) + object_storage: ObjectStorageSettings = Field(default_factory=ObjectStorageSettings) # Top-level settings loglevel: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = Field(default="INFO", validation_alias="DJ_LOG_LEVEL") @@ -275,13 +321,19 @@ def get_store_spec(self, store: str) -> dict[str, Any]: # Validate protocol protocol = spec.get("protocol", "").lower() - if protocol not in ("file", "s3"): - raise DataJointError(f'Missing or invalid protocol in config.stores["{store}"]') + supported_protocols = ("file", "s3", "gcs", "azure") + if protocol not in supported_protocols: + raise DataJointError( + f'Missing or invalid protocol in config.stores["{store}"]. ' + f'Supported protocols: {", ".join(supported_protocols)}' + ) # Define required and allowed keys by protocol required_keys: dict[str, tuple[str, ...]] = { "file": ("protocol", "location"), "s3": ("protocol", "endpoint", "bucket", "access_key", "secret_key", "location"), + "gcs": ("protocol", "bucket", "location"), + "azure": ("protocol", "container", "location"), } allowed_keys: dict[str, tuple[str, ...]] = { "file": ("protocol", "location", "subfolding", "stage"), @@ -297,6 +349,25 @@ def get_store_spec(self, store: str) -> dict[str, Any]: "stage", "proxy_server", ), + "gcs": ( + "protocol", + "bucket", + "location", + "token", + "project", + "subfolding", + "stage", + ), + "azure": ( + "protocol", + "container", + "location", + "account_name", + "account_key", + "connection_string", + "subfolding", + "stage", + ), } # Check required keys @@ -311,6 +382,73 @@ def get_store_spec(self, store: str) -> dict[str, Any]: return spec + def get_object_storage_spec(self) -> dict[str, Any]: + """ + Get validated object storage configuration. + + Returns: + Object storage configuration dict + + Raises: + DataJointError: If object storage is not configured or has invalid config + """ + os_settings = self.object_storage + + # Check if object storage is configured + if not os_settings.protocol: + raise DataJointError( + "Object storage is not configured. Set object_storage.protocol in datajoint.json " + "or DJ_OBJECT_STORAGE_PROTOCOL environment variable." + ) + + if not os_settings.project_name: + raise DataJointError( + "Object storage project_name is required. Set object_storage.project_name in datajoint.json " + "or DJ_OBJECT_STORAGE_PROJECT_NAME environment variable." + ) + + protocol = os_settings.protocol.lower() + supported_protocols = ("file", "s3", "gcs", "azure") + if protocol not in supported_protocols: + raise DataJointError( + f"Invalid object_storage.protocol: {protocol}. " f'Supported protocols: {", ".join(supported_protocols)}' + ) + + # Build spec dict + spec = { + "project_name": os_settings.project_name, + "protocol": protocol, + "location": os_settings.location or "", + "partition_pattern": os_settings.partition_pattern, + "token_length": os_settings.token_length, + } + + # Add protocol-specific settings + if protocol == "s3": + if not os_settings.endpoint or not os_settings.bucket: + raise DataJointError("object_storage.endpoint and object_storage.bucket are required for S3") + if not os_settings.access_key or not os_settings.secret_key: + raise DataJointError("object_storage.access_key and object_storage.secret_key are required for S3") + spec.update( + { + "endpoint": os_settings.endpoint, + "bucket": os_settings.bucket, + "access_key": os_settings.access_key, + "secret_key": os_settings.secret_key.get_secret_value() if os_settings.secret_key else None, + "secure": os_settings.secure, + } + ) + elif protocol == "gcs": + if not os_settings.bucket: + raise DataJointError("object_storage.bucket is required for GCS") + spec["bucket"] = os_settings.bucket + elif protocol == "azure": + if not os_settings.container: + raise DataJointError("object_storage.container is required for Azure") + spec["container"] = os_settings.container + + return spec + def load(self, filename: str | Path) -> None: """ Load settings from a JSON file. diff --git a/src/datajoint/staged_insert.py b/src/datajoint/staged_insert.py new file mode 100644 index 000000000..9083bb78b --- /dev/null +++ b/src/datajoint/staged_insert.py @@ -0,0 +1,315 @@ +""" +Staged insert context manager for direct object storage writes. + +This module provides the StagedInsert class which allows writing directly +to object storage before finalizing the database insert. +""" + +import json +import mimetypes +from contextlib import contextmanager +from datetime import datetime, timezone +from typing import IO, Any + +import fsspec + +from .errors import DataJointError +from .settings import config +from .storage import StorageBackend, build_object_path + + +class StagedInsert: + """ + Context manager for staged insert operations. + + Allows direct writes to object storage before finalizing the database insert. + Used for large objects like Zarr arrays where copying from local storage + is inefficient. + + Usage: + with table.staged_insert1 as staged: + staged.rec['subject_id'] = 123 + staged.rec['session_id'] = 45 + + # Create object storage directly + z = zarr.open(staged.store('raw_data', '.zarr'), mode='w', shape=(1000, 1000)) + z[:] = data + + # Assign to record + staged.rec['raw_data'] = z + + # On successful exit: metadata computed, record inserted + # On exception: storage cleaned up, no record inserted + """ + + def __init__(self, table): + """ + Initialize a staged insert. + + Args: + table: The Table instance to insert into + """ + self._table = table + self._rec: dict[str, Any] = {} + self._staged_objects: dict[str, dict] = {} # field -> {path, ext, token} + self._backend: StorageBackend | None = None + + @property + def rec(self) -> dict[str, Any]: + """Record dict for setting attribute values.""" + return self._rec + + @property + def fs(self) -> fsspec.AbstractFileSystem: + """Return fsspec filesystem for advanced operations.""" + self._ensure_backend() + return self._backend.fs + + def _ensure_backend(self): + """Ensure storage backend is initialized.""" + if self._backend is None: + try: + spec = config.get_object_storage_spec() + self._backend = StorageBackend(spec) + except DataJointError: + raise DataJointError( + "Object storage is not configured. Set object_storage settings in datajoint.json " + "or DJ_OBJECT_STORAGE_* environment variables." + ) + + def _get_storage_path(self, field: str, ext: str = "") -> str: + """ + Get or create the storage path for a field. + + Args: + field: Name of the object attribute + ext: Optional extension (e.g., ".zarr") + + Returns: + Full storage path + """ + self._ensure_backend() + + if field in self._staged_objects: + return self._staged_objects[field]["full_path"] + + # Validate field is an object attribute + if field not in self._table.heading: + raise DataJointError(f"Attribute '{field}' not found in table heading") + + attr = self._table.heading[field] + if not attr.is_object: + raise DataJointError(f"Attribute '{field}' is not an object type") + + # Extract primary key from rec + primary_key = {k: self._rec[k] for k in self._table.primary_key if k in self._rec} + if len(primary_key) != len(self._table.primary_key): + raise DataJointError( + "Primary key values must be set in staged.rec before calling store() or open(). " + f"Missing: {set(self._table.primary_key) - set(primary_key)}" + ) + + # Get storage spec + spec = config.get_object_storage_spec() + partition_pattern = spec.get("partition_pattern") + token_length = spec.get("token_length", 8) + location = spec.get("location", "") + + # Build storage path + relative_path, token = build_object_path( + schema=self._table.database, + table=self._table.class_name, + field=field, + primary_key=primary_key, + ext=ext if ext else None, + partition_pattern=partition_pattern, + token_length=token_length, + ) + + # Full path with location prefix + full_path = f"{location}/{relative_path}" if location else relative_path + + # Store staged object info + self._staged_objects[field] = { + "relative_path": relative_path, + "full_path": full_path, + "ext": ext if ext else None, + "token": token, + } + + return full_path + + def store(self, field: str, ext: str = "") -> fsspec.FSMap: + """ + Get an FSMap store for direct writes to an object field. + + Args: + field: Name of the object attribute + ext: Optional extension (e.g., ".zarr", ".hdf5") + + Returns: + fsspec.FSMap suitable for Zarr/xarray + """ + path = self._get_storage_path(field, ext) + return self._backend.get_fsmap(path) + + def open(self, field: str, ext: str = "", mode: str = "wb") -> IO: + """ + Open a file for direct writes to an object field. + + Args: + field: Name of the object attribute + ext: Optional extension (e.g., ".bin", ".dat") + mode: File mode (default: "wb") + + Returns: + File-like object for writing + """ + path = self._get_storage_path(field, ext) + return self._backend.open(path, mode) + + def _compute_metadata(self, field: str) -> dict: + """ + Compute metadata for a staged object after writing is complete. + + Args: + field: Name of the object attribute + + Returns: + JSON-serializable metadata dict + """ + info = self._staged_objects[field] + full_path = info["full_path"] + ext = info["ext"] + + # Check if it's a directory (multiple files) or single file + full_remote_path = self._backend._full_path(full_path) + + try: + is_dir = self._backend.fs.isdir(full_remote_path) + except Exception: + is_dir = False + + if is_dir: + # Calculate total size and file count + total_size = 0 + item_count = 0 + files = [] + + for root, dirs, filenames in self._backend.fs.walk(full_remote_path): + for filename in filenames: + file_path = f"{root}/{filename}" + try: + file_size = self._backend.fs.size(file_path) + rel_path = file_path[len(full_remote_path) :].lstrip("/") + files.append({"path": rel_path, "size": file_size}) + total_size += file_size + item_count += 1 + except Exception: + pass + + # Create manifest + manifest = { + "files": files, + "total_size": total_size, + "item_count": item_count, + "created": datetime.now(timezone.utc).isoformat(), + } + + # Write manifest alongside folder + manifest_path = f"{full_path}.manifest.json" + self._backend.put_buffer(json.dumps(manifest, indent=2).encode(), manifest_path) + + metadata = { + "path": info["relative_path"], + "size": total_size, + "hash": None, + "ext": ext, + "is_dir": True, + "timestamp": datetime.now(timezone.utc).isoformat(), + "item_count": item_count, + } + else: + # Single file + try: + size = self._backend.size(full_path) + except Exception: + size = 0 + + metadata = { + "path": info["relative_path"], + "size": size, + "hash": None, + "ext": ext, + "is_dir": False, + "timestamp": datetime.now(timezone.utc).isoformat(), + } + + # Add mime_type for files + if ext: + mime_type, _ = mimetypes.guess_type(f"file{ext}") + if mime_type: + metadata["mime_type"] = mime_type + + return metadata + + def _finalize(self): + """ + Finalize the staged insert by computing metadata and inserting the record. + """ + # Process each staged object + for field in list(self._staged_objects.keys()): + metadata = self._compute_metadata(field) + # Store JSON metadata in the record + self._rec[field] = json.dumps(metadata) + + # Insert the record + self._table.insert1(self._rec) + + def _cleanup(self): + """ + Clean up staged objects on failure. + """ + if self._backend is None: + return + + for field, info in self._staged_objects.items(): + full_path = info["full_path"] + try: + # Check if it's a directory + full_remote_path = self._backend._full_path(full_path) + if self._backend.fs.exists(full_remote_path): + if self._backend.fs.isdir(full_remote_path): + self._backend.remove_folder(full_path) + else: + self._backend.remove(full_path) + except Exception: + pass # Best effort cleanup + + +@contextmanager +def staged_insert1(table): + """ + Context manager for staged insert operations. + + Args: + table: The Table instance to insert into + + Yields: + StagedInsert instance for setting record values and getting storage handles + + Example: + with staged_insert1(Recording) as staged: + staged.rec['subject_id'] = 123 + staged.rec['session_id'] = 45 + z = zarr.open(staged.store('raw_data', '.zarr'), mode='w') + z[:] = data + staged.rec['raw_data'] = z + """ + staged = StagedInsert(table) + try: + yield staged + staged._finalize() + except Exception: + staged._cleanup() + raise diff --git a/src/datajoint/storage.py b/src/datajoint/storage.py new file mode 100644 index 000000000..325364ea3 --- /dev/null +++ b/src/datajoint/storage.py @@ -0,0 +1,775 @@ +""" +Storage backend abstraction using fsspec for unified file operations. + +This module provides a unified interface for storage operations across different +backends (local filesystem, S3, GCS, Azure, etc.) using the fsspec library. +""" + +import json +import logging +import secrets +import urllib.parse +from datetime import datetime, timezone +from pathlib import Path, PurePosixPath +from typing import Any + +import fsspec + +from . import errors + +logger = logging.getLogger(__name__.split(".")[0]) + +# Characters safe for use in filenames and URLs +TOKEN_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" + +# Supported remote URL protocols for copy insert +REMOTE_PROTOCOLS = ("s3://", "gs://", "gcs://", "az://", "abfs://", "http://", "https://") + + +def is_remote_url(path: str) -> bool: + """ + Check if a path is a remote URL. + + Args: + path: Path string to check + + Returns: + True if path is a remote URL + """ + if not isinstance(path, str): + return False + return path.lower().startswith(REMOTE_PROTOCOLS) + + +def parse_remote_url(url: str) -> tuple[str, str]: + """ + Parse a remote URL into protocol and path. + + Args: + url: Remote URL (e.g., 's3://bucket/path/file.dat') + + Returns: + Tuple of (protocol, path) where protocol is fsspec-compatible + """ + url_lower = url.lower() + + # Map URL schemes to fsspec protocols + protocol_map = { + "s3://": "s3", + "gs://": "gcs", + "gcs://": "gcs", + "az://": "abfs", + "abfs://": "abfs", + "http://": "http", + "https://": "https", + } + + for prefix, protocol in protocol_map.items(): + if url_lower.startswith(prefix): + path = url[len(prefix) :] + return protocol, path + + raise errors.DataJointError(f"Unsupported remote URL protocol: {url}") + + +def generate_token(length: int = 8) -> str: + """ + Generate a random token for filename collision avoidance. + + Args: + length: Token length (4-16 characters, default 8) + + Returns: + Random URL-safe string + """ + length = max(4, min(16, length)) + return "".join(secrets.choice(TOKEN_ALPHABET) for _ in range(length)) + + +def encode_pk_value(value: Any) -> str: + """ + Encode a primary key value for use in storage paths. + + Args: + value: Primary key value (int, str, date, etc.) + + Returns: + Path-safe string representation + """ + if isinstance(value, (int, float)): + return str(value) + if isinstance(value, datetime): + # Use ISO format with safe separators + return value.strftime("%Y-%m-%dT%H-%M-%S") + if hasattr(value, "isoformat"): + # Handle date objects + return value.isoformat() + + # String handling + s = str(value) + # Check if path-safe (no special characters) + unsafe_chars = '/\\:*?"<>|' + if any(c in s for c in unsafe_chars) or len(s) > 100: + # URL-encode unsafe strings or truncate long ones + if len(s) > 100: + # Truncate and add hash suffix for uniqueness + import hashlib + + hash_suffix = hashlib.md5(s.encode()).hexdigest()[:8] + s = s[:50] + "_" + hash_suffix + return urllib.parse.quote(s, safe="") + return s + + +def build_object_path( + schema: str, + table: str, + field: str, + primary_key: dict[str, Any], + ext: str | None, + partition_pattern: str | None = None, + token_length: int = 8, +) -> tuple[str, str]: + """ + Build the storage path for an object attribute. + + Args: + schema: Schema name + table: Table name + field: Field/attribute name + primary_key: Dict of primary key attribute names to values + ext: File extension (e.g., ".dat") or None + partition_pattern: Optional partition pattern with {attr} placeholders + token_length: Length of random token suffix + + Returns: + Tuple of (relative_path, token) + """ + token = generate_token(token_length) + + # Build filename: field_token.ext + filename = f"{field}_{token}" + if ext: + if not ext.startswith("."): + ext = "." + ext + filename += ext + + # Build primary key path components + pk_parts = [] + partition_attrs = set() + + # Extract partition attributes if pattern specified + if partition_pattern: + import re + + partition_attrs = set(re.findall(r"\{(\w+)\}", partition_pattern)) + + # Build partition prefix (attributes specified in partition pattern) + partition_parts = [] + for attr in partition_attrs: + if attr in primary_key: + partition_parts.append(f"{attr}={encode_pk_value(primary_key[attr])}") + + # Build remaining PK path (attributes not in partition) + for attr, value in primary_key.items(): + if attr not in partition_attrs: + pk_parts.append(f"{attr}={encode_pk_value(value)}") + + # Construct full path + # Pattern: {partition_attrs}/{schema}/{table}/objects/{remaining_pk}/{filename} + parts = [] + if partition_parts: + parts.extend(partition_parts) + parts.append(schema) + parts.append(table) + parts.append("objects") + if pk_parts: + parts.extend(pk_parts) + parts.append(filename) + + return "/".join(parts), token + + +class StorageBackend: + """ + Unified storage backend using fsspec. + + Provides a consistent interface for file operations across different storage + backends including local filesystem and cloud object storage (S3, GCS, Azure). + """ + + def __init__(self, spec: dict[str, Any]): + """ + Initialize storage backend from configuration spec. + + Args: + spec: Storage configuration dictionary containing: + - protocol: Storage protocol ('file', 's3', 'gcs', 'azure') + - location: Base path or bucket prefix + - bucket: Bucket name (for cloud storage) + - endpoint: Endpoint URL (for S3-compatible storage) + - access_key: Access key (for cloud storage) + - secret_key: Secret key (for cloud storage) + - secure: Use HTTPS (default: True for cloud) + - Additional protocol-specific options + """ + self.spec = spec + self.protocol = spec.get("protocol", "file") + self._fs = None + self._validate_spec() + + def _validate_spec(self): + """Validate configuration spec for the protocol.""" + if self.protocol == "file": + location = self.spec.get("location") + if location and not Path(location).is_dir(): + raise FileNotFoundError(f"Inaccessible local directory {location}") + elif self.protocol == "s3": + required = ["endpoint", "bucket", "access_key", "secret_key"] + missing = [k for k in required if not self.spec.get(k)] + if missing: + raise errors.DataJointError(f"Missing S3 configuration: {', '.join(missing)}") + + @property + def fs(self) -> fsspec.AbstractFileSystem: + """Get or create the fsspec filesystem instance.""" + if self._fs is None: + self._fs = self._create_filesystem() + return self._fs + + def _create_filesystem(self) -> fsspec.AbstractFileSystem: + """Create fsspec filesystem based on protocol.""" + if self.protocol == "file": + return fsspec.filesystem("file") + + elif self.protocol == "s3": + # Build S3 configuration + endpoint = self.spec["endpoint"] + # Determine if endpoint includes protocol + if not endpoint.startswith(("http://", "https://")): + secure = self.spec.get("secure", False) + endpoint_url = f"{'https' if secure else 'http'}://{endpoint}" + else: + endpoint_url = endpoint + + return fsspec.filesystem( + "s3", + key=self.spec["access_key"], + secret=self.spec["secret_key"], + client_kwargs={"endpoint_url": endpoint_url}, + ) + + elif self.protocol == "gcs": + return fsspec.filesystem( + "gcs", + token=self.spec.get("token"), + project=self.spec.get("project"), + ) + + elif self.protocol == "azure": + return fsspec.filesystem( + "abfs", + account_name=self.spec.get("account_name"), + account_key=self.spec.get("account_key"), + connection_string=self.spec.get("connection_string"), + ) + + else: + raise errors.DataJointError(f"Unsupported storage protocol: {self.protocol}") + + def _full_path(self, path: str | PurePosixPath) -> str: + """ + Construct full path including bucket for cloud storage. + + Args: + path: Relative path within the storage location + + Returns: + Full path suitable for fsspec operations + """ + path = str(path) + if self.protocol == "s3": + bucket = self.spec["bucket"] + return f"{bucket}/{path}" + elif self.protocol in ("gcs", "azure"): + bucket = self.spec.get("bucket") or self.spec.get("container") + return f"{bucket}/{path}" + else: + # Local filesystem - path is already absolute or relative to cwd + return path + + def put_file(self, local_path: str | Path, remote_path: str | PurePosixPath, metadata: dict | None = None): + """ + Upload a file from local filesystem to storage. + + Args: + local_path: Path to local file + remote_path: Destination path in storage + metadata: Optional metadata to attach to the file + """ + full_path = self._full_path(remote_path) + logger.debug(f"put_file: {local_path} -> {self.protocol}:{full_path}") + + if self.protocol == "file": + # For local filesystem, use safe copy with atomic rename + from .utils import safe_copy + + Path(full_path).parent.mkdir(parents=True, exist_ok=True) + safe_copy(local_path, full_path, overwrite=True) + else: + # For cloud storage, use fsspec put + self.fs.put_file(str(local_path), full_path) + + def get_file(self, remote_path: str | PurePosixPath, local_path: str | Path): + """ + Download a file from storage to local filesystem. + + Args: + remote_path: Path in storage + local_path: Destination path on local filesystem + """ + full_path = self._full_path(remote_path) + logger.debug(f"get_file: {self.protocol}:{full_path} -> {local_path}") + + local_path = Path(local_path) + local_path.parent.mkdir(parents=True, exist_ok=True) + + if self.protocol == "file": + from .utils import safe_copy + + safe_copy(full_path, local_path) + else: + self.fs.get_file(full_path, str(local_path)) + + def put_buffer(self, buffer: bytes, remote_path: str | PurePosixPath): + """ + Write bytes to storage. + + Args: + buffer: Bytes to write + remote_path: Destination path in storage + """ + full_path = self._full_path(remote_path) + logger.debug(f"put_buffer: {len(buffer)} bytes -> {self.protocol}:{full_path}") + + if self.protocol == "file": + from .utils import safe_write + + Path(full_path).parent.mkdir(parents=True, exist_ok=True) + safe_write(full_path, buffer) + else: + self.fs.pipe_file(full_path, buffer) + + def get_buffer(self, remote_path: str | PurePosixPath) -> bytes: + """ + Read bytes from storage. + + Args: + remote_path: Path in storage + + Returns: + File contents as bytes + """ + full_path = self._full_path(remote_path) + logger.debug(f"get_buffer: {self.protocol}:{full_path}") + + try: + if self.protocol == "file": + return Path(full_path).read_bytes() + else: + return self.fs.cat_file(full_path) + except FileNotFoundError: + raise errors.MissingExternalFile(f"Missing external file {full_path}") from None + + def exists(self, remote_path: str | PurePosixPath) -> bool: + """ + Check if a file exists in storage. + + Args: + remote_path: Path in storage + + Returns: + True if file exists + """ + full_path = self._full_path(remote_path) + logger.debug(f"exists: {self.protocol}:{full_path}") + + if self.protocol == "file": + return Path(full_path).is_file() + else: + return self.fs.exists(full_path) + + def remove(self, remote_path: str | PurePosixPath): + """ + Remove a file from storage. + + Args: + remote_path: Path in storage + """ + full_path = self._full_path(remote_path) + logger.debug(f"remove: {self.protocol}:{full_path}") + + try: + if self.protocol == "file": + Path(full_path).unlink(missing_ok=True) + else: + self.fs.rm(full_path) + except FileNotFoundError: + pass # Already gone + + def size(self, remote_path: str | PurePosixPath) -> int: + """ + Get file size in bytes. + + Args: + remote_path: Path in storage + + Returns: + File size in bytes + """ + full_path = self._full_path(remote_path) + + if self.protocol == "file": + return Path(full_path).stat().st_size + else: + return self.fs.size(full_path) + + def open(self, remote_path: str | PurePosixPath, mode: str = "rb"): + """ + Open a file in storage. + + Args: + remote_path: Path in storage + mode: File mode ('rb', 'wb', etc.) + + Returns: + File-like object + """ + full_path = self._full_path(remote_path) + return self.fs.open(full_path, mode) + + def put_folder(self, local_path: str | Path, remote_path: str | PurePosixPath) -> dict: + """ + Upload a folder to storage. + + Args: + local_path: Path to local folder + remote_path: Destination path in storage + + Returns: + Manifest dict with file list, total_size, and item_count + """ + local_path = Path(local_path) + if not local_path.is_dir(): + raise errors.DataJointError(f"Not a directory: {local_path}") + + full_path = self._full_path(remote_path) + logger.debug(f"put_folder: {local_path} -> {self.protocol}:{full_path}") + + # Collect file info for manifest + files = [] + total_size = 0 + + for root, dirs, filenames in local_path.walk(): + for filename in filenames: + file_path = root / filename + rel_path = file_path.relative_to(local_path).as_posix() + file_size = file_path.stat().st_size + files.append({"path": rel_path, "size": file_size}) + total_size += file_size + + # Upload folder contents + if self.protocol == "file": + import shutil + + dest = Path(full_path) + dest.mkdir(parents=True, exist_ok=True) + for item in local_path.iterdir(): + if item.is_file(): + shutil.copy2(item, dest / item.name) + else: + shutil.copytree(item, dest / item.name, dirs_exist_ok=True) + else: + self.fs.put(str(local_path), full_path, recursive=True) + + # Build manifest + manifest = { + "files": files, + "total_size": total_size, + "item_count": len(files), + "created": datetime.now(timezone.utc).isoformat(), + } + + # Write manifest alongside folder + manifest_path = f"{remote_path}.manifest.json" + self.put_buffer(json.dumps(manifest, indent=2).encode(), manifest_path) + + return manifest + + def remove_folder(self, remote_path: str | PurePosixPath): + """ + Remove a folder and its manifest from storage. + + Args: + remote_path: Path to folder in storage + """ + full_path = self._full_path(remote_path) + logger.debug(f"remove_folder: {self.protocol}:{full_path}") + + try: + if self.protocol == "file": + import shutil + + shutil.rmtree(full_path, ignore_errors=True) + else: + self.fs.rm(full_path, recursive=True) + except FileNotFoundError: + pass + + # Also remove manifest + manifest_path = f"{remote_path}.manifest.json" + self.remove(manifest_path) + + def get_fsmap(self, remote_path: str | PurePosixPath) -> fsspec.FSMap: + """ + Get an FSMap for a path (useful for Zarr/xarray). + + Args: + remote_path: Path in storage + + Returns: + fsspec.FSMap instance + """ + full_path = self._full_path(remote_path) + return fsspec.FSMap(full_path, self.fs) + + def copy_from_url(self, source_url: str, dest_path: str | PurePosixPath) -> int: + """ + Copy a file from a remote URL to managed storage. + + Args: + source_url: Remote URL (s3://, gs://, http://, etc.) + dest_path: Destination path in managed storage + + Returns: + Size of copied file in bytes + """ + protocol, source_path = parse_remote_url(source_url) + full_dest = self._full_path(dest_path) + + logger.debug(f"copy_from_url: {protocol}://{source_path} -> {self.protocol}:{full_dest}") + + # Get source filesystem + source_fs = fsspec.filesystem(protocol) + + # Check if source is a directory + if source_fs.isdir(source_path): + return self._copy_folder_from_url(source_fs, source_path, dest_path) + + # Copy single file + if self.protocol == "file": + # Download to local destination + Path(full_dest).parent.mkdir(parents=True, exist_ok=True) + source_fs.get_file(source_path, full_dest) + return Path(full_dest).stat().st_size + else: + # Remote-to-remote copy via streaming + with source_fs.open(source_path, "rb") as src: + content = src.read() + self.fs.pipe_file(full_dest, content) + return len(content) + + def _copy_folder_from_url( + self, source_fs: fsspec.AbstractFileSystem, source_path: str, dest_path: str | PurePosixPath + ) -> dict: + """ + Copy a folder from a remote URL to managed storage. + + Args: + source_fs: Source filesystem + source_path: Path in source filesystem + dest_path: Destination path in managed storage + + Returns: + Manifest dict with file list, total_size, and item_count + """ + full_dest = self._full_path(dest_path) + logger.debug(f"copy_folder_from_url: {source_path} -> {self.protocol}:{full_dest}") + + # Collect file info for manifest + files = [] + total_size = 0 + + # Walk source directory + for root, dirs, filenames in source_fs.walk(source_path): + for filename in filenames: + src_file = f"{root}/{filename}" if root != source_path else f"{source_path}/{filename}" + rel_path = src_file[len(source_path) :].lstrip("/") + file_size = source_fs.size(src_file) + files.append({"path": rel_path, "size": file_size}) + total_size += file_size + + # Copy file + dest_file = f"{full_dest}/{rel_path}" + if self.protocol == "file": + Path(dest_file).parent.mkdir(parents=True, exist_ok=True) + source_fs.get_file(src_file, dest_file) + else: + with source_fs.open(src_file, "rb") as src: + content = src.read() + self.fs.pipe_file(dest_file, content) + + # Build manifest + manifest = { + "files": files, + "total_size": total_size, + "item_count": len(files), + "created": datetime.now(timezone.utc).isoformat(), + } + + # Write manifest alongside folder + manifest_path = f"{dest_path}.manifest.json" + self.put_buffer(json.dumps(manifest, indent=2).encode(), manifest_path) + + return manifest + + def source_is_directory(self, source: str) -> bool: + """ + Check if a source path (local or remote URL) is a directory. + + Args: + source: Local path or remote URL + + Returns: + True if source is a directory + """ + if is_remote_url(source): + protocol, path = parse_remote_url(source) + source_fs = fsspec.filesystem(protocol) + return source_fs.isdir(path) + else: + return Path(source).is_dir() + + def source_exists(self, source: str) -> bool: + """ + Check if a source path (local or remote URL) exists. + + Args: + source: Local path or remote URL + + Returns: + True if source exists + """ + if is_remote_url(source): + protocol, path = parse_remote_url(source) + source_fs = fsspec.filesystem(protocol) + return source_fs.exists(path) + else: + return Path(source).exists() + + def get_source_size(self, source: str) -> int | None: + """ + Get the size of a source file (local or remote URL). + + Args: + source: Local path or remote URL + + Returns: + Size in bytes, or None if directory or cannot determine + """ + try: + if is_remote_url(source): + protocol, path = parse_remote_url(source) + source_fs = fsspec.filesystem(protocol) + if source_fs.isdir(path): + return None + return source_fs.size(path) + else: + p = Path(source) + if p.is_dir(): + return None + return p.stat().st_size + except Exception: + return None + + +STORE_METADATA_FILENAME = "datajoint_store.json" + + +def get_storage_backend(spec: dict[str, Any]) -> StorageBackend: + """ + Factory function to create a storage backend from configuration. + + Args: + spec: Storage configuration dictionary + + Returns: + StorageBackend instance + """ + return StorageBackend(spec) + + +def verify_or_create_store_metadata(backend: StorageBackend, spec: dict[str, Any]) -> dict: + """ + Verify or create the store metadata file at the storage root. + + On first use, creates the datajoint_store.json file with project info. + On subsequent uses, verifies the project_name matches. + + Args: + backend: StorageBackend instance + spec: Object storage configuration spec + + Returns: + Store metadata dict + + Raises: + DataJointError: If project_name mismatch detected + """ + from .version import __version__ as dj_version + + project_name = spec.get("project_name") + location = spec.get("location", "") + + # Metadata file path at storage root + metadata_path = f"{location}/{STORE_METADATA_FILENAME}" if location else STORE_METADATA_FILENAME + + try: + # Try to read existing metadata + if backend.exists(metadata_path): + metadata_content = backend.get_buffer(metadata_path) + metadata = json.loads(metadata_content) + + # Verify project_name matches + store_project = metadata.get("project_name") + if store_project and store_project != project_name: + raise errors.DataJointError( + f"Object store project name mismatch.\n" + f' Client configured: "{project_name}"\n' + f' Store metadata: "{store_project}"\n' + f"Ensure all clients use the same object_storage.project_name setting." + ) + + return metadata + else: + # Create new metadata + metadata = { + "project_name": project_name, + "created": datetime.now(timezone.utc).isoformat(), + "format_version": "1.0", + "datajoint_version": dj_version, + } + + # Optional database info - not enforced, just informational + # These would need to be passed in from the connection context + # For now, omit them + + backend.put_buffer(json.dumps(metadata, indent=2).encode(), metadata_path) + return metadata + + except errors.DataJointError: + raise + except Exception as e: + # Log warning but don't fail - metadata is informational + logger.warning(f"Could not verify/create store metadata: {e}") + return {"project_name": project_name} diff --git a/src/datajoint/table.py b/src/datajoint/table.py index a8a52c3e0..d94dfd66f 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -4,15 +4,16 @@ import itertools import json import logging +import mimetypes import platform import re import uuid +from datetime import datetime, timezone from pathlib import Path import numpy as np import pandas -from . import blob from .condition import make_condition from .declare import alter, declare from .errors import ( @@ -25,6 +26,8 @@ from .expression import QueryExpression from .heading import Heading from .settings import config +from .staged_insert import staged_insert1 as _staged_insert1 +from .storage import StorageBackend, build_object_path, verify_or_create_store_metadata from .utils import get_master, is_camel_case, user_choice from .version import __version__ as version @@ -269,6 +272,125 @@ def _log(self): def external(self): return self.connection.schemas[self.database].external + @property + def object_storage(self) -> StorageBackend | None: + """Get the object storage backend for this table.""" + if not hasattr(self, "_object_storage"): + try: + spec = config.get_object_storage_spec() + self._object_storage = StorageBackend(spec) + # Verify/create store metadata on first use + verify_or_create_store_metadata(self._object_storage, spec) + except DataJointError: + self._object_storage = None + return self._object_storage + + def _process_object_value(self, name: str, value, row: dict) -> str: + """ + Process an object attribute value for insert. + + Args: + name: Attribute name + value: Input value (file path, folder path, or (ext, stream) tuple) + row: The full row dict (needed for primary key values) + + Returns: + JSON string for database storage + """ + if self.object_storage is None: + raise DataJointError( + "Object storage is not configured. Set object_storage settings in datajoint.json " + "or DJ_OBJECT_STORAGE_* environment variables." + ) + + # Extract primary key values from row + primary_key = {k: row[k] for k in self.primary_key if k in row} + if not primary_key: + raise DataJointError("Primary key values must be provided before object attributes for insert.") + + # Determine input type and extract extension + is_dir = False + ext = None + size = 0 + source_path = None + stream = None + + if isinstance(value, tuple) and len(value) == 2: + # Tuple of (ext, stream) + ext, stream = value + if hasattr(stream, "read"): + # Read stream to buffer for upload + content = stream.read() + size = len(content) + else: + raise DataJointError(f"Invalid stream object for attribute {name}") + elif isinstance(value, (str, Path)): + source_path = Path(value) + if not source_path.exists(): + raise DataJointError(f"File or folder not found: {source_path}") + is_dir = source_path.is_dir() + if not is_dir: + ext = source_path.suffix or None + size = source_path.stat().st_size + else: + raise DataJointError( + f"Invalid value type for object attribute {name}. " "Expected file path, folder path, or (ext, stream) tuple." + ) + + # Get storage spec for path building + spec = config.get_object_storage_spec() + partition_pattern = spec.get("partition_pattern") + token_length = spec.get("token_length", 8) + location = spec.get("location", "") + + # Build storage path + relative_path, token = build_object_path( + schema=self.database, + table=self.class_name, + field=name, + primary_key=primary_key, + ext=ext, + partition_pattern=partition_pattern, + token_length=token_length, + ) + + # Prepend location if specified + full_storage_path = f"{location}/{relative_path}" if location else relative_path + + # Upload content + manifest = None + if source_path: + if is_dir: + manifest = self.object_storage.put_folder(source_path, full_storage_path) + size = manifest["total_size"] + else: + self.object_storage.put_file(source_path, full_storage_path) + elif stream: + self.object_storage.put_buffer(content, full_storage_path) + + # Build JSON metadata + timestamp = datetime.now(timezone.utc).isoformat() + metadata = { + "path": relative_path, + "size": size, + "hash": None, # Hash is optional, not computed by default + "ext": ext, + "is_dir": is_dir, + "timestamp": timestamp, + } + + # Add mime_type for files + if not is_dir and ext: + mime_type, _ = mimetypes.guess_type(f"file{ext}") + if mime_type: + metadata["mime_type"] = mime_type + + # Add item_count for folders + if is_dir and manifest: + metadata["item_count"] = manifest["item_count"] + + return json.dumps(metadata) + def update1(self, row): """ ``update1`` updates one existing entry in the table. @@ -320,6 +442,35 @@ def insert1(self, row, **kwargs): """ self.insert((row,), **kwargs) + @property + def staged_insert1(self): + """ + Context manager for staged insert with direct object storage writes. + + Use this for large objects like Zarr arrays where copying from local storage + is inefficient. Allows writing directly to the destination storage before + finalizing the database insert. + + Example: + with table.staged_insert1 as staged: + staged.rec['subject_id'] = 123 + staged.rec['session_id'] = 45 + + # Create object storage directly + z = zarr.open(staged.store('raw_data', '.zarr'), mode='w', shape=(1000, 1000)) + z[:] = data + + # Assign to record + staged.rec['raw_data'] = z + + # On successful exit: metadata computed, record inserted + # On exception: storage cleaned up, no record inserted + + Yields: + StagedInsert: Context for setting record values and getting storage handles + """ + return _staged_insert1(self) + def insert( self, rows, @@ -713,7 +864,7 @@ def describe(self, context=None, printout=False): return definition # --- private helper functions ---- - def __make_placeholder(self, name, value, ignore_extra_fields=False): + def __make_placeholder(self, name, value, ignore_extra_fields=False, row=None): """ For a given attribute `name` with `value`, return its processed value or value placeholder as a string to be included in the query and the value, if any, to be submitted for @@ -721,12 +872,16 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False): :param name: name of attribute to be inserted :param value: value of attribute to be inserted + :param ignore_extra_fields: if True, return None for unknown fields + :param row: the full row dict (needed for object attributes to extract primary key) """ if ignore_extra_fields and name not in self.heading: return None attr = self.heading[name] if attr.adapter: - value = attr.adapter.put(value) + # Custom attribute type: validate and encode + attr.adapter.validate(value) + value = attr.adapter.encode(value, key=None) if value is None or (attr.numeric and (value == "" or np.isnan(float(value)))): # set default value placeholder, value = "DEFAULT", None @@ -740,8 +895,10 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False): raise DataJointError("badly formed UUID value {v} for attribute `{n}`".format(v=value, n=name)) value = value.bytes elif attr.is_blob: - value = blob.pack(value) - value = self.external[attr.store].put(value).bytes if attr.is_external else value + # Adapters (like ) handle serialization in encode() + # Without adapter, blob columns store raw bytes (no serialization) + if attr.is_external: + value = self.external[attr.store].put(value).bytes elif attr.is_attachment: attachment_path = Path(value) if attr.is_external: @@ -752,6 +909,13 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False): value = str.encode(attachment_path.name) + b"\0" + attachment_path.read_bytes() elif attr.is_filepath: value = self.external[attr.store].upload_filepath(value).bytes + elif attr.is_object: + # Object type - upload to object storage and return JSON metadata + if row is None: + raise DataJointError( + f"Object attribute {name} requires full row context for insert. " "This is an internal error." + ) + value = self._process_object_value(name, value, row) elif attr.numeric: value = str(int(value) if isinstance(value, bool) else value) elif attr.json: @@ -780,17 +944,23 @@ def check_fields(fields): elif set(field_list) != set(fields).intersection(self.heading.names): raise DataJointError("Attempt to insert rows with different fields.") + # Convert row to dict for object attribute processing + row_dict = None if isinstance(row, np.void): # np.array check_fields(row.dtype.fields) + row_dict = {name: row[name] for name in row.dtype.fields} attributes = [ - self.__make_placeholder(name, row[name], ignore_extra_fields) + self.__make_placeholder(name, row[name], ignore_extra_fields, row=row_dict) for name in self.heading if name in row.dtype.fields ] elif isinstance(row, collections.abc.Mapping): # dict-based check_fields(row) + row_dict = dict(row) attributes = [ - self.__make_placeholder(name, row[name], ignore_extra_fields) for name in self.heading if name in row + self.__make_placeholder(name, row[name], ignore_extra_fields, row=row_dict) + for name in self.heading + if name in row ] else: # positional try: @@ -803,8 +973,10 @@ def check_fields(fields): except TypeError: raise DataJointError("Datatype %s cannot be inserted" % type(row)) else: + row_dict = dict(zip(self.heading.names, row)) attributes = [ - self.__make_placeholder(name, value, ignore_extra_fields) for name, value in zip(self.heading, row) + self.__make_placeholder(name, value, ignore_extra_fields, row=row_dict) + for name, value in zip(self.heading, row) ] if ignore_extra_fields: attributes = [a for a in attributes if a is not None] diff --git a/src/datajoint/user_tables.py b/src/datajoint/user_tables.py index d7faeb285..59065e7f1 100644 --- a/src/datajoint/user_tables.py +++ b/src/datajoint/user_tables.py @@ -152,6 +152,15 @@ class Imported(UserTable, AutoPopulate): _prefix = "_" tier_regexp = r"(?P" + _prefix + _base_regexp + ")" + def drop_quick(self): + """ + Drop the table and its associated jobs table. + """ + # Drop the jobs table first if it exists + if self._jobs_table is not None and self._jobs_table.is_declared: + self._jobs_table.drop_quick() + super().drop_quick() + class Computed(UserTable, AutoPopulate): """ @@ -162,6 +171,15 @@ class Computed(UserTable, AutoPopulate): _prefix = "__" tier_regexp = r"(?P" + _prefix + _base_regexp + ")" + def drop_quick(self): + """ + Drop the table and its associated jobs table. + """ + # Drop the jobs table first if it exists + if self._jobs_table is not None and self._jobs_table.is_declared: + self._jobs_table.drop_quick() + super().drop_quick() + class Part(UserTable): """ diff --git a/tests/conftest.py b/tests/conftest.py index 8a6ba4057..23222f43a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,12 +16,10 @@ import datajoint as dj from datajoint.errors import ( - ADAPTED_TYPE_SWITCH, FILEPATH_FEATURE_SWITCH, - DataJointError, ) -from . import schema, schema_advanced, schema_external, schema_simple +from . import schema, schema_advanced, schema_external, schema_object, schema_simple from . import schema_uuid as schema_uuid_module from . import schema_type_aliases as schema_type_aliases_module @@ -56,21 +54,6 @@ def clean_autopopulate(experiment, trial, ephys): experiment.delete() -@pytest.fixture -def clean_jobs(schema_any): - """ - Explicit cleanup fixture for jobs tests. - - Cleans jobs table before test runs. - Tests must explicitly request this fixture to get cleanup. - """ - try: - schema_any.jobs.delete() - except DataJointError: - pass - yield - - @pytest.fixture def clean_test_tables(test, test_extra, test_no_extra): """ @@ -334,10 +317,14 @@ def monkeymodule(): @pytest.fixture -def enable_adapted_types(monkeypatch): - monkeypatch.setenv(ADAPTED_TYPE_SWITCH, "TRUE") +def enable_adapted_types(): + """ + Deprecated fixture - custom attribute types no longer require a feature flag. + + This fixture is kept for backward compatibility but does nothing. + Custom types are now enabled by default via the AttributeType system. + """ yield - monkeypatch.delenv(ADAPTED_TYPE_SWITCH, raising=True) @pytest.fixture @@ -566,10 +553,6 @@ def mock_cache(tmpdir_factory): def schema_any(connection_test, prefix): schema_any = dj.Schema(prefix + "_test1", schema.LOCALS_ANY, connection=connection_test) assert schema.LOCALS_ANY, "LOCALS_ANY is empty" - try: - schema_any.jobs.delete() - except DataJointError: - pass schema_any(schema.TTest) schema_any(schema.TTest2) schema_any(schema.TTest3) @@ -609,10 +592,6 @@ def schema_any(connection_test, prefix): schema_any(schema.Stimulus) schema_any(schema.Longblob) yield schema_any - try: - schema_any.jobs.delete() - except DataJointError: - pass schema_any.drop() @@ -621,10 +600,6 @@ def schema_any_fresh(connection_test, prefix): """Function-scoped schema_any for tests that need fresh schema state.""" schema_any = dj.Schema(prefix + "_test1_fresh", schema.LOCALS_ANY, connection=connection_test) assert schema.LOCALS_ANY, "LOCALS_ANY is empty" - try: - schema_any.jobs.delete() - except DataJointError: - pass schema_any(schema.TTest) schema_any(schema.TTest2) schema_any(schema.TTest3) @@ -664,10 +639,6 @@ def schema_any_fresh(connection_test, prefix): schema_any(schema.Stimulus) schema_any(schema.Longblob) yield schema_any - try: - schema_any.jobs.delete() - except DataJointError: - pass schema_any.drop() @@ -903,3 +874,67 @@ def channel(schema_any): @pytest.fixture def trash(schema_any): return schema.UberTrash() + + +# Object storage fixtures + + +@pytest.fixture +def object_storage_config(tmpdir_factory): + """Create object storage configuration for testing.""" + location = str(tmpdir_factory.mktemp("object_storage")) + return { + "project_name": "test_project", + "protocol": "file", + "location": location, + "token_length": 8, + } + + +@pytest.fixture +def mock_object_storage(object_storage_config, monkeypatch): + """Mock object storage configuration in datajoint config.""" + # Store original config + original_object_storage = getattr(dj.config, "_object_storage", None) + + # Create a mock ObjectStorageSettings-like object + class MockObjectStorageSettings: + def __init__(self, config): + self.project_name = config["project_name"] + self.protocol = config["protocol"] + self.location = config["location"] + self.token_length = config.get("token_length", 8) + self.partition_pattern = config.get("partition_pattern") + self.bucket = config.get("bucket") + self.endpoint = config.get("endpoint") + self.access_key = config.get("access_key") + self.secret_key = config.get("secret_key") + self.secure = config.get("secure", True) + self.container = config.get("container") + + mock_settings = MockObjectStorageSettings(object_storage_config) + + # Patch the object_storage attribute + monkeypatch.setattr(dj.config, "object_storage", mock_settings) + + yield object_storage_config + + # Restore original + if original_object_storage is not None: + monkeypatch.setattr(dj.config, "object_storage", original_object_storage) + + +@pytest.fixture +def schema_obj(connection_test, prefix, mock_object_storage): + """Schema for object type tests.""" + schema = dj.Schema( + prefix + "_object", + context=schema_object.LOCALS_OBJECT, + connection=connection_test, + ) + schema(schema_object.ObjectFile) + schema(schema_object.ObjectFolder) + schema(schema_object.ObjectMultiple) + schema(schema_object.ObjectWithOther) + yield schema + schema.drop() diff --git a/tests/schema_object.py b/tests/schema_object.py new file mode 100644 index 000000000..fe5215a37 --- /dev/null +++ b/tests/schema_object.py @@ -0,0 +1,51 @@ +""" +Schema definitions for object type tests. +""" + +import datajoint as dj + +LOCALS_OBJECT = locals() + + +class ObjectFile(dj.Manual): + """Table for testing object type with files.""" + + definition = """ + file_id : int + --- + data_file : object # stored file + """ + + +class ObjectFolder(dj.Manual): + """Table for testing object type with folders.""" + + definition = """ + folder_id : int + --- + data_folder : object # stored folder + """ + + +class ObjectMultiple(dj.Manual): + """Table for testing multiple object attributes.""" + + definition = """ + record_id : int + --- + raw_data : object # raw data file + processed : object # processed data file + """ + + +class ObjectWithOther(dj.Manual): + """Table for testing object type with other attributes.""" + + definition = """ + subject_id : int + session_id : int + --- + name : varchar(100) + data_file : object + notes : varchar(255) + """ diff --git a/tests/test_adapted_attributes.py b/tests/test_adapted_attributes.py index 1060a50ed..0b4285ffb 100644 --- a/tests/test_adapted_attributes.py +++ b/tests/test_adapted_attributes.py @@ -1,3 +1,10 @@ +""" +Tests for adapted/custom attribute types. + +These tests use the legacy AttributeAdapter API for backward compatibility testing. +""" + +import warnings from itertools import zip_longest import networkx as nx @@ -8,6 +15,9 @@ from . import schema_adapted from .schema_adapted import Connectivity, Layout +# Filter deprecation warnings from legacy AttributeAdapter usage in these tests +pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning") + @pytest.fixture def schema_name(prefix): @@ -16,24 +26,28 @@ def schema_name(prefix): @pytest.fixture def adapted_graph_instance(): - yield schema_adapted.GraphAdapter() + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + yield schema_adapted.GraphAdapter() @pytest.fixture def schema_ad( connection_test, adapted_graph_instance, - enable_adapted_types, enable_filepath_feature, s3_creds, tmpdir, schema_name, ): dj.config["stores"] = {"repo-s3": dict(s3_creds, protocol="s3", location="adapted/repo", stage=str(tmpdir))} + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + layout_adapter = schema_adapted.LayoutToFilepath() context = { **schema_adapted.LOCALS_ADAPTED, "graph": adapted_graph_instance, - "layout_to_filepath": schema_adapted.LayoutToFilepath(), + "layout_to_filepath": layout_adapter, } schema = dj.schema(schema_name, context=context, connection=connection_test) schema(schema_adapted.Connectivity) @@ -92,7 +106,7 @@ def test_adapted_filepath_type(schema_ad, minio_client): c.delete() -def test_adapted_spawned(local_schema, enable_adapted_types): +def test_adapted_spawned(local_schema): c = Connectivity() # a spawned class graphs = [ nx.lollipop_graph(4, 2), diff --git a/tests/test_attribute_type.py b/tests/test_attribute_type.py new file mode 100644 index 000000000..f8f822a60 --- /dev/null +++ b/tests/test_attribute_type.py @@ -0,0 +1,419 @@ +""" +Tests for the new AttributeType system. +""" + +import pytest + +import datajoint as dj +from datajoint.attribute_type import ( + AttributeType, + _type_registry, + get_type, + is_type_registered, + list_types, + register_type, + resolve_dtype, + unregister_type, +) +from datajoint.errors import DataJointError + + +class TestAttributeTypeRegistry: + """Tests for the type registry functionality.""" + + def setup_method(self): + """Clear any test types from registry before each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def teardown_method(self): + """Clean up test types after each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def test_register_type_decorator(self): + """Test registering a type using the decorator.""" + + @register_type + class TestType(AttributeType): + type_name = "test_decorator" + dtype = "longblob" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + assert is_type_registered("test_decorator") + assert get_type("test_decorator").type_name == "test_decorator" + + def test_register_type_direct(self): + """Test registering a type by calling register_type directly.""" + + class TestType(AttributeType): + type_name = "test_direct" + dtype = "varchar(255)" + + def encode(self, value, *, key=None): + return str(value) + + def decode(self, stored, *, key=None): + return stored + + register_type(TestType) + assert is_type_registered("test_direct") + + def test_register_type_idempotent(self): + """Test that registering the same type twice is idempotent.""" + + @register_type + class TestType(AttributeType): + type_name = "test_idempotent" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + # Second registration should not raise + register_type(TestType) + assert is_type_registered("test_idempotent") + + def test_register_duplicate_name_different_class(self): + """Test that registering different classes with same name raises error.""" + + @register_type + class TestType1(AttributeType): + type_name = "test_duplicate" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + class TestType2(AttributeType): + type_name = "test_duplicate" + dtype = "varchar(100)" + + def encode(self, value, *, key=None): + return str(value) + + def decode(self, stored, *, key=None): + return stored + + with pytest.raises(DataJointError, match="already registered"): + register_type(TestType2) + + def test_unregister_type(self): + """Test unregistering a type.""" + + @register_type + class TestType(AttributeType): + type_name = "test_unregister" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + assert is_type_registered("test_unregister") + unregister_type("test_unregister") + assert not is_type_registered("test_unregister") + + def test_get_type_not_found(self): + """Test that getting an unregistered type raises error.""" + with pytest.raises(DataJointError, match="Unknown attribute type"): + get_type("nonexistent_type") + + def test_list_types(self): + """Test listing registered types.""" + + @register_type + class TestType(AttributeType): + type_name = "test_list" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + types = list_types() + assert "test_list" in types + assert types == sorted(types) # Should be sorted + + def test_get_type_strips_brackets(self): + """Test that get_type accepts names with or without angle brackets.""" + + @register_type + class TestType(AttributeType): + type_name = "test_brackets" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + assert get_type("test_brackets") is get_type("") + + +class TestAttributeTypeValidation: + """Tests for the validate method.""" + + def setup_method(self): + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def teardown_method(self): + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def test_validate_called_default(self): + """Test that default validate accepts any value.""" + + @register_type + class TestType(AttributeType): + type_name = "test_validate_default" + dtype = "longblob" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + t = get_type("test_validate_default") + # Default validate should not raise for any value + t.validate(None) + t.validate(42) + t.validate("string") + t.validate([1, 2, 3]) + + def test_validate_custom(self): + """Test custom validation logic.""" + + @register_type + class PositiveIntType(AttributeType): + type_name = "test_positive_int" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + def validate(self, value): + if not isinstance(value, int): + raise TypeError(f"Expected int, got {type(value).__name__}") + if value < 0: + raise ValueError("Value must be positive") + + t = get_type("test_positive_int") + t.validate(42) # Should pass + + with pytest.raises(TypeError): + t.validate("not an int") + + with pytest.raises(ValueError): + t.validate(-1) + + +class TestTypeChaining: + """Tests for type chaining (dtype referencing another custom type).""" + + def setup_method(self): + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def teardown_method(self): + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def test_resolve_native_dtype(self): + """Test resolving a native dtype.""" + final_dtype, chain = resolve_dtype("longblob") + assert final_dtype == "longblob" + assert chain == [] + + def test_resolve_custom_dtype(self): + """Test resolving a custom dtype.""" + + @register_type + class TestType(AttributeType): + type_name = "test_resolve" + dtype = "varchar(100)" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain = resolve_dtype("") + assert final_dtype == "varchar(100)" + assert len(chain) == 1 + assert chain[0].type_name == "test_resolve" + + def test_resolve_chained_dtype(self): + """Test resolving a chained dtype.""" + + @register_type + class InnerType(AttributeType): + type_name = "test_inner" + dtype = "longblob" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + @register_type + class OuterType(AttributeType): + type_name = "test_outer" + dtype = "" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain = resolve_dtype("") + assert final_dtype == "longblob" + assert len(chain) == 2 + assert chain[0].type_name == "test_outer" + assert chain[1].type_name == "test_inner" + + def test_circular_reference_detection(self): + """Test that circular type references are detected.""" + + @register_type + class TypeA(AttributeType): + type_name = "test_circular_a" + dtype = "" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + @register_type + class TypeB(AttributeType): + type_name = "test_circular_b" + dtype = "" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + with pytest.raises(DataJointError, match="Circular type reference"): + resolve_dtype("") + + +class TestExportsAndAPI: + """Test that the public API is properly exported.""" + + def test_exports_from_datajoint(self): + """Test that AttributeType and helpers are exported from datajoint.""" + assert hasattr(dj, "AttributeType") + assert hasattr(dj, "register_type") + assert hasattr(dj, "list_types") + + def test_attribute_adapter_deprecated(self): + """Test that AttributeAdapter is still available but deprecated.""" + assert hasattr(dj, "AttributeAdapter") + # AttributeAdapter should be a subclass of AttributeType + assert issubclass(dj.AttributeAdapter, dj.AttributeType) + + +class TestDJBlobType: + """Tests for the built-in DJBlobType.""" + + def test_djblob_is_registered(self): + """Test that djblob is automatically registered.""" + assert is_type_registered("djblob") + + def test_djblob_properties(self): + """Test DJBlobType properties.""" + blob_type = get_type("djblob") + assert blob_type.type_name == "djblob" + assert blob_type.dtype == "longblob" + + def test_djblob_encode_decode_roundtrip(self): + """Test that encode/decode is a proper roundtrip.""" + import numpy as np + + blob_type = get_type("djblob") + + # Test with various data types + test_data = [ + {"key": "value", "number": 42}, + [1, 2, 3, 4, 5], + np.array([1.0, 2.0, 3.0]), + "simple string", + (1, 2, 3), + None, + ] + + for original in test_data: + encoded = blob_type.encode(original) + assert isinstance(encoded, bytes) + decoded = blob_type.decode(encoded) + if isinstance(original, np.ndarray): + np.testing.assert_array_equal(decoded, original) + else: + assert decoded == original + + def test_djblob_encode_produces_valid_blob_format(self): + """Test that encoded data has valid blob protocol header.""" + blob_type = get_type("djblob") + encoded = blob_type.encode({"test": "data"}) + + # Should start with compression prefix or protocol header + valid_prefixes = (b"ZL123\0", b"mYm\0", b"dj0\0") + assert any(encoded.startswith(p) for p in valid_prefixes) + + def test_djblob_in_list_types(self): + """Test that djblob appears in list_types.""" + types = list_types() + assert "djblob" in types + + def test_djblob_handles_serialization(self): + """Test that DJBlobType handles serialization internally. + + With the new design: + - Plain longblob columns store/return raw bytes (no serialization) + - handles pack/unpack in encode/decode + - Legacy AttributeAdapter handles pack/unpack internally for backward compat + """ + blob_type = get_type("djblob") + + # DJBlobType.encode() should produce packed bytes + data = {"key": "value"} + encoded = blob_type.encode(data) + assert isinstance(encoded, bytes) + + # DJBlobType.decode() should unpack back to original + decoded = blob_type.decode(encoded) + assert decoded == data diff --git a/tests/test_autopopulate.py b/tests/test_autopopulate.py index b22b252ee..1f1d33a84 100644 --- a/tests/test_autopopulate.py +++ b/tests/test_autopopulate.py @@ -61,17 +61,22 @@ def test_populate_key_list(clean_autopopulate, subject, experiment, trial): assert n == ret["success_count"] -def test_populate_exclude_error_and_ignore_jobs(clean_autopopulate, schema_any, subject, experiment): +def test_populate_exclude_error_and_ignore_jobs(clean_autopopulate, subject, experiment): # test simple populate assert subject, "root tables are empty" assert not experiment, "table already filled?" + # Ensure jobs table is set up by refreshing + jobs = experiment.jobs + jobs.refresh() + keys = experiment.key_source.fetch("KEY", limit=2) for idx, key in enumerate(keys): if idx == 0: - schema_any.jobs.ignore(experiment.table_name, key) + jobs.ignore(key) else: - schema_any.jobs.error(experiment.table_name, key, "") + jobs.reserve(key) + jobs.error(key, error_message="Test error") experiment.populate(reserve_jobs=True) assert len(experiment.key_source & experiment) == len(experiment.key_source) - 2 diff --git a/tests/test_jobs.py b/tests/test_jobs.py index 4ffc431fe..1925eb4b5 100644 --- a/tests/test_jobs.py +++ b/tests/test_jobs.py @@ -1,130 +1,398 @@ +""" +Tests for the Autopopulate 2.0 per-table jobs system. +""" + import random import string - import datajoint as dj -from datajoint.jobs import ERROR_MESSAGE_LENGTH, TRUNCATION_APPENDIX +from datajoint.jobs import JobsTable, ERROR_MESSAGE_LENGTH, TRUNCATION_APPENDIX from . import schema -def test_reserve_job(clean_jobs, subject, schema_any): - assert subject - table_name = "fake_table" +class TestJobsTableStructure: + """Tests for JobsTable structure and initialization.""" + + def test_jobs_property_exists(self, schema_any): + """Test that Computed tables have a jobs property.""" + assert hasattr(schema.SigIntTable, "jobs") + jobs = schema.SigIntTable().jobs + assert isinstance(jobs, JobsTable) + + def test_jobs_table_name(self, schema_any): + """Test that jobs table has correct naming convention.""" + jobs = schema.SigIntTable().jobs + # SigIntTable is __sig_int_table, jobs should be ~sig_int_table__jobs + assert jobs.table_name.startswith("~") + assert jobs.table_name.endswith("__jobs") + + def test_jobs_table_primary_key(self, schema_any): + """Test that jobs table has FK-derived primary key.""" + jobs = schema.SigIntTable().jobs + # SigIntTable depends on SimpleSource with pk 'id' + assert "id" in jobs.primary_key + + def test_jobs_table_status_column(self, schema_any): + """Test that jobs table has status column with correct enum values.""" + jobs = schema.SigIntTable().jobs + jobs._ensure_declared() + status_attr = jobs.heading.attributes["status"] + assert "pending" in status_attr.type + assert "reserved" in status_attr.type + assert "success" in status_attr.type + assert "error" in status_attr.type + assert "ignore" in status_attr.type + + +class TestJobsRefresh: + """Tests for JobsTable.refresh() method.""" + + def test_refresh_adds_jobs(self, schema_any): + """Test that refresh() adds pending jobs for keys in key_source.""" + table = schema.SigIntTable() + jobs = table.jobs + jobs.delete() # Clear any existing jobs + + result = jobs.refresh() + assert result["added"] > 0 + assert len(jobs.pending) > 0 + + def test_refresh_with_priority(self, schema_any): + """Test that refresh() sets priority on new jobs.""" + table = schema.SigIntTable() + jobs = table.jobs + jobs.delete() + + jobs.refresh(priority=3) + priorities = jobs.pending.fetch("priority") + assert all(p == 3 for p in priorities) + + def test_refresh_with_delay(self, schema_any): + """Test that refresh() sets scheduled_time in the future.""" + table = schema.SigIntTable() + jobs = table.jobs + jobs.delete() + + jobs.refresh(delay=3600) # 1 hour delay + # Jobs should not be available for processing yet + keys = jobs.fetch_pending() + assert len(keys) == 0 # All jobs are scheduled for later + + def test_refresh_removes_stale_jobs(self, schema_any): + """Test that refresh() removes jobs for deleted upstream records.""" + # This test requires manipulating upstream data + pass # Skip for now + + +class TestJobsReserve: + """Tests for JobsTable.reserve() method.""" + + def test_reserve_pending_job(self, schema_any): + """Test that reserve() transitions pending -> reserved.""" + table = schema.SigIntTable() + jobs = table.jobs + jobs.delete() + jobs.refresh() + + # Get first pending job + key = jobs.pending.fetch("KEY", limit=1)[0] + jobs.reserve(key) + + # Verify status changed + status = (jobs & key).fetch1("status") + assert status == "reserved" + + def test_reserve_sets_metadata(self, schema_any): + """Test that reserve() sets user, host, pid, connection_id.""" + table = schema.SigIntTable() + jobs = table.jobs + jobs.delete() + jobs.refresh() + + key = jobs.pending.fetch("KEY", limit=1)[0] + jobs.reserve(key) + + # Verify metadata was set + row = (jobs & key).fetch1() + assert row["status"] == "reserved" + assert row["reserved_time"] is not None + assert row["user"] != "" + assert row["host"] != "" + assert row["pid"] > 0 + assert row["connection_id"] > 0 + + +class TestJobsComplete: + """Tests for JobsTable.complete() method.""" + + def test_complete_with_keep_false(self, schema_any): + """Test that complete() deletes job when keep=False.""" + table = schema.SigIntTable() + jobs = table.jobs + jobs.delete() + jobs.refresh() + + key = jobs.pending.fetch("KEY", limit=1)[0] + jobs.reserve(key) + jobs.complete(key, duration=1.5, keep=False) + + assert key not in jobs + + def test_complete_with_keep_true(self, schema_any): + """Test that complete() marks job as success when keep=True.""" + table = schema.SigIntTable() + jobs = table.jobs + jobs.delete() + jobs.refresh() + + key = jobs.pending.fetch("KEY", limit=1)[0] + jobs.reserve(key) + jobs.complete(key, duration=1.5, keep=True) + + status = (jobs & key).fetch1("status") + assert status == "success" + + +class TestJobsError: + """Tests for JobsTable.error() method.""" + + def test_error_marks_status(self, schema_any): + """Test that error() marks job as error with message.""" + table = schema.SigIntTable() + jobs = table.jobs + jobs.delete() + jobs.refresh() + + key = jobs.pending.fetch("KEY", limit=1)[0] + jobs.reserve(key) + jobs.error(key, error_message="Test error", error_stack="stack trace") + + status, msg = (jobs & key).fetch1("status", "error_message") + assert status == "error" + assert msg == "Test error" + + def test_error_truncates_long_message(self, schema_any): + """Test that error() truncates long error messages.""" + table = schema.SigIntTable() + jobs = table.jobs + jobs.delete() + jobs.refresh() - # reserve jobs - for key in subject.fetch("KEY"): - assert schema_any.jobs.reserve(table_name, key), "failed to reserve a job" + long_message = "".join(random.choice(string.ascii_letters) for _ in range(ERROR_MESSAGE_LENGTH + 100)) + + key = jobs.pending.fetch("KEY", limit=1)[0] + jobs.reserve(key) + jobs.error(key, error_message=long_message) + + msg = (jobs & key).fetch1("error_message") + assert len(msg) == ERROR_MESSAGE_LENGTH + assert msg.endswith(TRUNCATION_APPENDIX) + + +class TestJobsIgnore: + """Tests for JobsTable.ignore() method.""" + + def test_ignore_marks_status(self, schema_any): + """Test that ignore() marks job as ignore.""" + table = schema.SigIntTable() + jobs = table.jobs + jobs.delete() + jobs.refresh() + + key = jobs.pending.fetch("KEY", limit=1)[0] + jobs.ignore(key) + + status = (jobs & key).fetch1("status") + assert status == "ignore" + + def test_ignore_new_key(self, schema_any): + """Test that ignore() can create new job with ignore status.""" + table = schema.SigIntTable() + jobs = table.jobs + jobs.delete() - # refuse jobs - for key in subject.fetch("KEY"): - assert not schema_any.jobs.reserve(table_name, key), "failed to respect reservation" + # Don't refresh - ignore a key directly + key = {"id": 1} + jobs.ignore(key) - # complete jobs - for key in subject.fetch("KEY"): - schema_any.jobs.complete(table_name, key) - assert not schema_any.jobs, "failed to free jobs" + status = (jobs & key).fetch1("status") + assert status == "ignore" - # reserve jobs again - for key in subject.fetch("KEY"): - assert schema_any.jobs.reserve(table_name, key), "failed to reserve new jobs" - # finish with error - for key in subject.fetch("KEY"): - schema_any.jobs.error(table_name, key, "error message") +class TestJobsStatusProperties: + """Tests for status filter properties.""" - # refuse jobs with errors - for key in subject.fetch("KEY"): - assert not schema_any.jobs.reserve(table_name, key), "failed to ignore error jobs" + def test_pending_property(self, schema_any): + """Test that pending property returns pending jobs.""" + table = schema.SigIntTable() + jobs = table.jobs + jobs.delete() + jobs.refresh() + + assert len(jobs.pending) > 0 + statuses = jobs.pending.fetch("status") + assert all(s == "pending" for s in statuses) + + def test_reserved_property(self, schema_any): + """Test that reserved property returns reserved jobs.""" + table = schema.SigIntTable() + jobs = table.jobs + jobs.delete() + jobs.refresh() - # clear error jobs - (schema_any.jobs & dict(status="error")).delete() - assert not schema_any.jobs, "failed to clear error jobs" + key = jobs.pending.fetch("KEY", limit=1)[0] + jobs.reserve(key) + assert len(jobs.reserved) == 1 + statuses = jobs.reserved.fetch("status") + assert all(s == "reserved" for s in statuses) -def test_restrictions(clean_jobs, schema_any): - jobs = schema_any.jobs - jobs.delete() - jobs.reserve("a", {"key": "a1"}) - jobs.reserve("a", {"key": "a2"}) - jobs.reserve("b", {"key": "b1"}) - jobs.error("a", {"key": "a2"}, "error") - jobs.error("b", {"key": "b1"}, "error") + def test_errors_property(self, schema_any): + """Test that errors property returns error jobs.""" + table = schema.SigIntTable() + jobs = table.jobs + jobs.delete() + jobs.refresh() - assert len(jobs & {"table_name": "a"}) == 2 - assert len(jobs & {"status": "error"}) == 2 - assert len(jobs & {"table_name": "a", "status": "error"}) == 1 - jobs.delete() + key = jobs.pending.fetch("KEY", limit=1)[0] + jobs.reserve(key) + jobs.error(key, error_message="test") + + assert len(jobs.errors) == 1 + def test_ignored_property(self, schema_any): + """Test that ignored property returns ignored jobs.""" + table = schema.SigIntTable() + jobs = table.jobs + jobs.delete() + jobs.refresh() -def test_sigint(clean_jobs, schema_any): - try: - schema.SigIntTable().populate(reserve_jobs=True) - except KeyboardInterrupt: - pass + key = jobs.pending.fetch("KEY", limit=1)[0] + jobs.ignore(key) + + assert len(jobs.ignored) == 1 + + +class TestJobsProgress: + """Tests for JobsTable.progress() method.""" + + def test_progress_returns_counts(self, schema_any): + """Test that progress() returns status counts.""" + table = schema.SigIntTable() + jobs = table.jobs + jobs.delete() + jobs.refresh() + + progress = jobs.progress() - assert len(schema_any.jobs.fetch()), "SigInt jobs table is empty" - status, error_message = schema_any.jobs.fetch1("status", "error_message") - assert status == "error" - assert error_message == "KeyboardInterrupt" + assert "pending" in progress + assert "reserved" in progress + assert "success" in progress + assert "error" in progress + assert "ignore" in progress + assert "total" in progress + assert progress["total"] == sum(progress[k] for k in ["pending", "reserved", "success", "error", "ignore"]) + + +class TestPopulateWithJobs: + """Tests for populate() with reserve_jobs=True using new system.""" + def test_populate_creates_jobs_table(self, schema_any): + """Test that populate with reserve_jobs creates jobs table.""" + table = schema.SigIntTable() + # Clear target table to allow re-population + table.delete() + + # First populate should create jobs table + table.populate(reserve_jobs=True, suppress_errors=True, max_calls=1) + + assert table.jobs.is_declared -def test_sigterm(clean_jobs, schema_any): - try: - schema.SigTermTable().populate(reserve_jobs=True) - except SystemExit: + def test_populate_uses_jobs_queue(self, schema_any): + """Test that populate processes jobs from queue.""" + table = schema.Experiment() + table.delete() + jobs = table.jobs + jobs.delete() + + # Refresh to add jobs + jobs.refresh() + initial_pending = len(jobs.pending) + assert initial_pending > 0 + + # Populate one job + result = table.populate(reserve_jobs=True, max_calls=1) + assert result["success_count"] >= 0 # May be 0 if error + + def test_populate_with_priority_filter(self, schema_any): + """Test that populate respects priority filter.""" + table = schema.Experiment() + table.delete() + jobs = table.jobs + jobs.delete() + + # Add jobs with different priorities + # This would require the table to have multiple keys + pass # Skip for now + + +class TestSchemaJobs: + """Tests for schema.jobs property.""" + + def test_schema_jobs_returns_list(self, schema_any): + """Test that schema.jobs returns list of JobsTable objects.""" + jobs_list = schema_any.jobs + assert isinstance(jobs_list, list) + + def test_schema_jobs_contains_jobs_tables(self, schema_any): + """Test that schema.jobs contains JobsTable instances.""" + jobs_list = schema_any.jobs + for jobs in jobs_list: + assert isinstance(jobs, JobsTable) + + +class TestTableDropLifecycle: + """Tests for table drop lifecycle.""" + + def test_drop_removes_jobs_table(self, schema_any): + """Test that dropping a table also drops its jobs table.""" + # Create a temporary computed table for this test + # This test would modify the schema, so skip for now pass - assert len(schema_any.jobs.fetch()), "SigTerm jobs table is empty" - status, error_message = schema_any.jobs.fetch1("status", "error_message") - assert status == "error" - assert error_message == "SystemExit: SIGTERM received" - - -def test_suppress_dj_errors(clean_jobs, schema_any): - """test_suppress_dj_errors: dj errors suppressible w/o native py blobs""" - with dj.config.override(enable_python_native_blobs=False): - schema.ErrorClass.populate(reserve_jobs=True, suppress_errors=True) - assert len(schema.DjExceptionName()) == len(schema_any.jobs) > 0 - - -def test_long_error_message(clean_jobs, subject, schema_any): - # create long error message - long_error_message = "".join(random.choice(string.ascii_letters) for _ in range(ERROR_MESSAGE_LENGTH + 100)) - short_error_message = "".join(random.choice(string.ascii_letters) for _ in range(ERROR_MESSAGE_LENGTH // 2)) - assert subject - table_name = "fake_table" - - key = subject.fetch("KEY", limit=1)[0] - - # test long error message - schema_any.jobs.reserve(table_name, key) - schema_any.jobs.error(table_name, key, long_error_message) - error_message = schema_any.jobs.fetch1("error_message") - assert len(error_message) == ERROR_MESSAGE_LENGTH, "error message is longer than max allowed" - assert error_message.endswith(TRUNCATION_APPENDIX), "appropriate ending missing for truncated error message" - schema_any.jobs.delete() - - # test long error message - schema_any.jobs.reserve(table_name, key) - schema_any.jobs.error(table_name, key, short_error_message) - error_message = schema_any.jobs.fetch1("error_message") - assert error_message == short_error_message, "error messages do not agree" - assert not error_message.endswith(TRUNCATION_APPENDIX), "error message should not be truncated" - schema_any.jobs.delete() - - -def test_long_error_stack(clean_jobs, subject, schema_any): - # create long error stack - STACK_SIZE = 89942 # Does not fit into small blob (should be 64k, but found to be higher) - long_error_stack = "".join(random.choice(string.ascii_letters) for _ in range(STACK_SIZE)) - assert subject - table_name = "fake_table" - - key = subject.fetch("KEY", limit=1)[0] - - # test long error stack - schema_any.jobs.reserve(table_name, key) - schema_any.jobs.error(table_name, key, "error message", long_error_stack) - error_stack = schema_any.jobs.fetch1("error_stack") - assert error_stack == long_error_stack, "error stacks do not agree" + +class TestConfiguration: + """Tests for jobs configuration settings.""" + + def test_default_priority_config(self, schema_any): + """Test that config.jobs.default_priority is used.""" + original = dj.config.jobs.default_priority + try: + dj.config.jobs.default_priority = 3 + + table = schema.SigIntTable() + jobs = table.jobs + jobs.delete() + jobs.refresh() # Should use default priority from config + + priorities = jobs.pending.fetch("priority") + assert all(p == 3 for p in priorities) + finally: + dj.config.jobs.default_priority = original + + def test_keep_completed_config(self, schema_any): + """Test that config.jobs.keep_completed affects complete().""" + # Test with keep_completed=True + with dj.config.override(jobs__keep_completed=True): + table = schema.SigIntTable() + jobs = table.jobs + jobs.delete() + jobs.refresh() + + key = jobs.pending.fetch("KEY", limit=1)[0] + jobs.reserve(key) + jobs.complete(key) # Should use config + + status = (jobs & key).fetch1("status") + assert status == "success" diff --git a/tests/test_object.py b/tests/test_object.py new file mode 100644 index 000000000..c2fd18cf6 --- /dev/null +++ b/tests/test_object.py @@ -0,0 +1,852 @@ +""" +Tests for the object column type. + +Tests cover: +- Storage path generation +- Insert with file, folder, and stream +- Fetch returning ObjectRef +- ObjectRef methods (read, open, download, listdir, walk, verify) +- Staged insert +- Error cases +""" + +import io +import json +import os +from pathlib import Path + +import pytest + +import datajoint as dj +from datajoint.objectref import ObjectRef +from datajoint.storage import build_object_path, generate_token, encode_pk_value + +from .schema_object import ObjectFile, ObjectFolder, ObjectMultiple, ObjectWithOther + + +class TestStoragePathGeneration: + """Tests for storage path generation utilities.""" + + def test_generate_token_default_length(self): + """Test token generation with default length.""" + token = generate_token() + assert len(token) == 8 + # All characters should be URL-safe + safe_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" + assert all(c in safe_chars for c in token) + + def test_generate_token_custom_length(self): + """Test token generation with custom length.""" + token = generate_token(12) + assert len(token) == 12 + + def test_generate_token_minimum_length(self): + """Test token generation respects minimum length.""" + token = generate_token(2) # Below minimum + assert len(token) == 4 # Should be clamped to minimum + + def test_generate_token_maximum_length(self): + """Test token generation respects maximum length.""" + token = generate_token(20) # Above maximum + assert len(token) == 16 # Should be clamped to maximum + + def test_generate_token_uniqueness(self): + """Test that generated tokens are unique.""" + tokens = [generate_token() for _ in range(100)] + assert len(set(tokens)) == 100 + + def test_encode_pk_value_integer(self): + """Test encoding integer primary key values.""" + assert encode_pk_value(123) == "123" + assert encode_pk_value(0) == "0" + assert encode_pk_value(-5) == "-5" + + def test_encode_pk_value_string(self): + """Test encoding string primary key values.""" + assert encode_pk_value("simple") == "simple" + assert encode_pk_value("test_value") == "test_value" + + def test_encode_pk_value_unsafe_chars(self): + """Test encoding strings with unsafe characters.""" + # Slash should be URL-encoded + result = encode_pk_value("path/to/file") + assert "/" not in result or result == "path%2Fto%2Ffile" + + def test_build_object_path_basic(self): + """Test basic object path building.""" + path, token = build_object_path( + schema="myschema", + table="MyTable", + field="data_file", + primary_key={"id": 123}, + ext=".dat", + ) + assert "myschema" in path + assert "MyTable" in path + assert "objects" in path + assert "id=123" in path + assert "data_file_" in path + assert path.endswith(".dat") + assert len(token) == 8 + + def test_build_object_path_no_extension(self): + """Test object path building without extension.""" + path, token = build_object_path( + schema="myschema", + table="MyTable", + field="data_folder", + primary_key={"id": 456}, + ext=None, + ) + assert not path.endswith(".") + assert "data_folder_" in path + + def test_build_object_path_multiple_pk(self): + """Test object path with multiple primary key attributes.""" + path, token = build_object_path( + schema="myschema", + table="MyTable", + field="raw_data", + primary_key={"subject_id": 1, "session_id": 2}, + ext=".zarr", + ) + assert "subject_id=1" in path + assert "session_id=2" in path + + def test_build_object_path_with_partition(self): + """Test object path with partition pattern.""" + path, token = build_object_path( + schema="myschema", + table="MyTable", + field="data", + primary_key={"subject_id": 1, "session_id": 2}, + ext=".dat", + partition_pattern="{subject_id}", + ) + # subject_id should be at the beginning due to partition + assert path.startswith("subject_id=1") + + +class TestObjectRef: + """Tests for ObjectRef class.""" + + def test_from_json_string(self): + """Test creating ObjectRef from JSON string.""" + json_str = json.dumps( + { + "path": "schema/Table/objects/id=1/data_abc123.dat", + "size": 1024, + "hash": None, + "ext": ".dat", + "is_dir": False, + "timestamp": "2025-01-15T10:30:00+00:00", + } + ) + obj = ObjectRef.from_json(json_str) + assert obj.path == "schema/Table/objects/id=1/data_abc123.dat" + assert obj.size == 1024 + assert obj.hash is None + assert obj.ext == ".dat" + assert obj.is_dir is False + + def test_from_json_dict(self): + """Test creating ObjectRef from dict.""" + data = { + "path": "schema/Table/objects/id=1/data_abc123.zarr", + "size": 5678, + "hash": None, + "ext": ".zarr", + "is_dir": True, + "timestamp": "2025-01-15T10:30:00+00:00", + "item_count": 42, + } + obj = ObjectRef.from_json(data) + assert obj.path == "schema/Table/objects/id=1/data_abc123.zarr" + assert obj.size == 5678 + assert obj.is_dir is True + assert obj.item_count == 42 + + def test_from_json_zarr_style(self): + """Test creating ObjectRef from Zarr-style JSON with null size.""" + data = { + "path": "schema/Recording/objects/id=1/neural_data_abc123.zarr", + "size": None, + "hash": None, + "ext": ".zarr", + "is_dir": True, + "timestamp": "2025-01-15T10:30:00+00:00", + } + obj = ObjectRef.from_json(data) + assert obj.path == "schema/Recording/objects/id=1/neural_data_abc123.zarr" + assert obj.size is None + assert obj.hash is None + assert obj.ext == ".zarr" + assert obj.is_dir is True + assert obj.item_count is None + + def test_to_json(self): + """Test converting ObjectRef to JSON dict.""" + from datetime import datetime, timezone + + obj = ObjectRef( + path="schema/Table/objects/id=1/data.dat", + size=1024, + hash=None, + ext=".dat", + is_dir=False, + timestamp=datetime(2025, 1, 15, 10, 30, tzinfo=timezone.utc), + ) + data = obj.to_json() + assert data["path"] == "schema/Table/objects/id=1/data.dat" + assert data["size"] == 1024 + assert data["is_dir"] is False + + def test_repr_file(self): + """Test string representation for file.""" + from datetime import datetime, timezone + + obj = ObjectRef( + path="test/path.dat", + size=1024, + hash=None, + ext=".dat", + is_dir=False, + timestamp=datetime.now(timezone.utc), + ) + assert "file" in repr(obj) + assert "test/path.dat" in repr(obj) + + def test_repr_folder(self): + """Test string representation for folder.""" + from datetime import datetime, timezone + + obj = ObjectRef( + path="test/folder.zarr", + size=5678, + hash=None, + ext=".zarr", + is_dir=True, + timestamp=datetime.now(timezone.utc), + ) + assert "folder" in repr(obj) + + def test_str(self): + """Test str() returns path.""" + from datetime import datetime, timezone + + obj = ObjectRef( + path="my/path/to/data.dat", + size=100, + hash=None, + ext=".dat", + is_dir=False, + timestamp=datetime.now(timezone.utc), + ) + assert str(obj) == "my/path/to/data.dat" + + +class TestObjectInsertFile: + """Tests for inserting files with object type.""" + + def test_insert_file(self, schema_obj, mock_object_storage, tmpdir_factory): + """Test inserting a file.""" + table = ObjectFile() + + # Create a test file + source_folder = tmpdir_factory.mktemp("source") + test_file = Path(source_folder, "test_data.dat") + data = os.urandom(1024) + with test_file.open("wb") as f: + f.write(data) + + # Insert the file + table.insert1({"file_id": 1, "data_file": str(test_file)}) + + # Verify record was inserted + assert len(table) == 1 + + # Cleanup + table.delete() + + def test_insert_file_with_extension(self, schema_obj, mock_object_storage, tmpdir_factory): + """Test that file extension is preserved.""" + table = ObjectFile() + + source_folder = tmpdir_factory.mktemp("source") + test_file = Path(source_folder, "data.csv") + test_file.write_text("a,b,c\n1,2,3\n") + + table.insert1({"file_id": 2, "data_file": str(test_file)}) + + # Fetch and check extension in metadata + record = table.fetch1() + obj = record["data_file"] + assert obj.ext == ".csv" + + table.delete() + + def test_insert_file_nonexistent(self, schema_obj, mock_object_storage): + """Test that inserting nonexistent file raises error.""" + table = ObjectFile() + + with pytest.raises(dj.DataJointError, match="not found"): + table.insert1({"file_id": 3, "data_file": "/nonexistent/path/file.dat"}) + + +class TestObjectInsertFolder: + """Tests for inserting folders with object type.""" + + def test_insert_folder(self, schema_obj, mock_object_storage, tmpdir_factory): + """Test inserting a folder.""" + table = ObjectFolder() + + # Create a test folder with files + source_folder = tmpdir_factory.mktemp("source") + data_folder = Path(source_folder, "data_folder") + data_folder.mkdir() + + # Add some files + (data_folder / "file1.txt").write_text("content1") + (data_folder / "file2.txt").write_text("content2") + subdir = data_folder / "subdir" + subdir.mkdir() + (subdir / "file3.txt").write_text("content3") + + # Insert the folder + table.insert1({"folder_id": 1, "data_folder": str(data_folder)}) + + assert len(table) == 1 + + # Fetch and verify + record = table.fetch1() + obj = record["data_folder"] + assert obj.is_dir is True + assert obj.item_count == 3 # 3 files + + table.delete() + + +class TestObjectInsertStream: + """Tests for inserting from streams with object type.""" + + def test_insert_stream(self, schema_obj, mock_object_storage): + """Test inserting from a stream.""" + table = ObjectFile() + + # Create a BytesIO stream + data = b"This is test data from a stream" + stream = io.BytesIO(data) + + # Insert with extension and stream tuple + table.insert1({"file_id": 10, "data_file": (".txt", stream)}) + + assert len(table) == 1 + + # Fetch and verify extension + record = table.fetch1() + obj = record["data_file"] + assert obj.ext == ".txt" + assert obj.size == len(data) + + table.delete() + + +class TestObjectFetch: + """Tests for fetching object type attributes.""" + + def test_fetch_returns_objectref(self, schema_obj, mock_object_storage, tmpdir_factory): + """Test that fetch returns ObjectRef.""" + table = ObjectFile() + + source_folder = tmpdir_factory.mktemp("source") + test_file = Path(source_folder, "test.dat") + test_file.write_bytes(os.urandom(512)) + + table.insert1({"file_id": 20, "data_file": str(test_file)}) + + record = table.fetch1() + obj = record["data_file"] + + assert isinstance(obj, ObjectRef) + assert obj.size == 512 + assert obj.is_dir is False + + table.delete() + + def test_fetch_metadata_no_io(self, schema_obj, mock_object_storage, tmpdir_factory): + """Test that accessing metadata does not perform I/O.""" + table = ObjectFile() + + source_folder = tmpdir_factory.mktemp("source") + test_file = Path(source_folder, "test.dat") + test_file.write_bytes(os.urandom(256)) + + table.insert1({"file_id": 21, "data_file": str(test_file)}) + + record = table.fetch1() + obj = record["data_file"] + + # These should all work without I/O + assert obj.path is not None + assert obj.size == 256 + assert obj.ext == ".dat" + assert obj.is_dir is False + assert obj.timestamp is not None + + table.delete() + + +class TestObjectRefOperations: + """Tests for ObjectRef file operations.""" + + def test_read_file(self, schema_obj, mock_object_storage, tmpdir_factory): + """Test reading file content via ObjectRef.""" + table = ObjectFile() + + source_folder = tmpdir_factory.mktemp("source") + test_file = Path(source_folder, "readable.dat") + original_data = os.urandom(128) + test_file.write_bytes(original_data) + + table.insert1({"file_id": 30, "data_file": str(test_file)}) + + record = table.fetch1() + obj = record["data_file"] + + # Read content + content = obj.read() + assert content == original_data + + table.delete() + + def test_open_file(self, schema_obj, mock_object_storage, tmpdir_factory): + """Test opening file via ObjectRef.""" + table = ObjectFile() + + source_folder = tmpdir_factory.mktemp("source") + test_file = Path(source_folder, "openable.txt") + test_file.write_text("Hello, World!") + + table.insert1({"file_id": 31, "data_file": str(test_file)}) + + record = table.fetch1() + obj = record["data_file"] + + # Open and read + with obj.open(mode="rb") as f: + content = f.read() + assert content == b"Hello, World!" + + table.delete() + + def test_download_file(self, schema_obj, mock_object_storage, tmpdir_factory): + """Test downloading file via ObjectRef.""" + table = ObjectFile() + + source_folder = tmpdir_factory.mktemp("source") + test_file = Path(source_folder, "downloadable.dat") + original_data = os.urandom(256) + test_file.write_bytes(original_data) + + table.insert1({"file_id": 32, "data_file": str(test_file)}) + + record = table.fetch1() + obj = record["data_file"] + + # Download to new location + download_folder = tmpdir_factory.mktemp("download") + local_path = obj.download(download_folder) + + assert Path(local_path).exists() + assert Path(local_path).read_bytes() == original_data + + table.delete() + + def test_exists(self, schema_obj, mock_object_storage, tmpdir_factory): + """Test exists() method.""" + table = ObjectFile() + + source_folder = tmpdir_factory.mktemp("source") + test_file = Path(source_folder, "exists.dat") + test_file.write_bytes(b"data") + + table.insert1({"file_id": 33, "data_file": str(test_file)}) + + record = table.fetch1() + obj = record["data_file"] + + assert obj.exists() is True + + table.delete() + + +class TestObjectRefFolderOperations: + """Tests for ObjectRef folder operations.""" + + def test_listdir(self, schema_obj, mock_object_storage, tmpdir_factory): + """Test listing folder contents.""" + table = ObjectFolder() + + source_folder = tmpdir_factory.mktemp("source") + data_folder = Path(source_folder, "listable") + data_folder.mkdir() + (data_folder / "a.txt").write_text("a") + (data_folder / "b.txt").write_text("b") + (data_folder / "c.txt").write_text("c") + + table.insert1({"folder_id": 40, "data_folder": str(data_folder)}) + + record = table.fetch1() + obj = record["data_folder"] + + contents = obj.listdir() + assert len(contents) == 3 + assert "a.txt" in contents + assert "b.txt" in contents + assert "c.txt" in contents + + table.delete() + + def test_walk(self, schema_obj, mock_object_storage, tmpdir_factory): + """Test walking folder tree.""" + table = ObjectFolder() + + source_folder = tmpdir_factory.mktemp("source") + data_folder = Path(source_folder, "walkable") + data_folder.mkdir() + (data_folder / "root.txt").write_text("root") + subdir = data_folder / "subdir" + subdir.mkdir() + (subdir / "nested.txt").write_text("nested") + + table.insert1({"folder_id": 41, "data_folder": str(data_folder)}) + + record = table.fetch1() + obj = record["data_folder"] + + # Collect walk results + walk_results = list(obj.walk()) + assert len(walk_results) >= 1 + + table.delete() + + def test_open_subpath(self, schema_obj, mock_object_storage, tmpdir_factory): + """Test opening file within folder using subpath.""" + table = ObjectFolder() + + source_folder = tmpdir_factory.mktemp("source") + data_folder = Path(source_folder, "subpathable") + data_folder.mkdir() + (data_folder / "inner.txt").write_text("inner content") + + table.insert1({"folder_id": 42, "data_folder": str(data_folder)}) + + record = table.fetch1() + obj = record["data_folder"] + + with obj.open("inner.txt", mode="rb") as f: + content = f.read() + assert content == b"inner content" + + table.delete() + + def test_read_on_folder_raises(self, schema_obj, mock_object_storage, tmpdir_factory): + """Test that read() on folder raises error.""" + table = ObjectFolder() + + source_folder = tmpdir_factory.mktemp("source") + data_folder = Path(source_folder, "folder") + data_folder.mkdir() + (data_folder / "file.txt").write_text("content") + + table.insert1({"folder_id": 43, "data_folder": str(data_folder)}) + + record = table.fetch1() + obj = record["data_folder"] + + with pytest.raises(dj.DataJointError, match="Cannot read"): + obj.read() + + table.delete() + + def test_listdir_on_file_raises(self, schema_obj, mock_object_storage, tmpdir_factory): + """Test that listdir() on file raises error.""" + table = ObjectFile() + + source_folder = tmpdir_factory.mktemp("source") + test_file = Path(source_folder, "file.dat") + test_file.write_bytes(b"data") + + table.insert1({"file_id": 44, "data_file": str(test_file)}) + + record = table.fetch1() + obj = record["data_file"] + + with pytest.raises(dj.DataJointError, match="Cannot listdir"): + obj.listdir() + + table.delete() + + +class TestObjectMultiple: + """Tests for tables with multiple object attributes.""" + + def test_multiple_objects(self, schema_obj, mock_object_storage, tmpdir_factory): + """Test inserting multiple object attributes.""" + table = ObjectMultiple() + + source_folder = tmpdir_factory.mktemp("source") + raw_file = Path(source_folder, "raw.dat") + raw_file.write_bytes(os.urandom(100)) + processed_file = Path(source_folder, "processed.dat") + processed_file.write_bytes(os.urandom(200)) + + table.insert1( + { + "record_id": 1, + "raw_data": str(raw_file), + "processed": str(processed_file), + } + ) + + record = table.fetch1() + raw_obj = record["raw_data"] + processed_obj = record["processed"] + + assert raw_obj.size == 100 + assert processed_obj.size == 200 + assert raw_obj.path != processed_obj.path + + table.delete() + + +class TestObjectWithOtherAttributes: + """Tests for object type mixed with other attributes.""" + + def test_object_with_other(self, schema_obj, mock_object_storage, tmpdir_factory): + """Test table with object and other attribute types.""" + table = ObjectWithOther() + + source_folder = tmpdir_factory.mktemp("source") + test_file = Path(source_folder, "data.bin") + test_file.write_bytes(os.urandom(64)) + + table.insert1( + { + "subject_id": 1, + "session_id": 1, + "name": "Test Session", + "data_file": str(test_file), + "notes": "Some notes here", + } + ) + + record = table.fetch1() + assert record["name"] == "Test Session" + assert record["notes"] == "Some notes here" + assert isinstance(record["data_file"], ObjectRef) + assert record["data_file"].size == 64 + + table.delete() + + +class TestObjectVerify: + """Tests for ObjectRef verification.""" + + def test_verify_file(self, schema_obj, mock_object_storage, tmpdir_factory): + """Test verifying file integrity.""" + table = ObjectFile() + + source_folder = tmpdir_factory.mktemp("source") + test_file = Path(source_folder, "verifiable.dat") + test_file.write_bytes(os.urandom(128)) + + table.insert1({"file_id": 50, "data_file": str(test_file)}) + + record = table.fetch1() + obj = record["data_file"] + + # Should not raise + assert obj.verify() is True + + table.delete() + + +class TestStagedInsert: + """Tests for staged insert operations.""" + + def test_staged_insert_basic(self, schema_obj, mock_object_storage): + """Test basic staged insert.""" + table = ObjectFile() + + with table.staged_insert1 as staged: + staged.rec["file_id"] = 60 + + # Write directly to storage + with staged.open("data_file", ".dat") as f: + f.write(b"staged data content") + + # No need to assign - metadata computed on exit + + # Verify record was inserted + assert len(table) == 1 + record = table.fetch1() + obj = record["data_file"] + assert obj.ext == ".dat" + + table.delete() + + def test_staged_insert_exception_cleanup(self, schema_obj, mock_object_storage): + """Test that staged insert cleans up on exception.""" + table = ObjectFile() + + try: + with table.staged_insert1 as staged: + staged.rec["file_id"] = 61 + + with staged.open("data_file", ".dat") as f: + f.write(b"will be cleaned up") + + raise ValueError("Simulated error") + except ValueError: + pass + + # No record should be inserted + assert len(table) == 0 + + def test_staged_insert_store_method(self, schema_obj, mock_object_storage): + """Test staged insert store() method returns FSMap.""" + import fsspec + + table = ObjectFile() + + with table.staged_insert1 as staged: + staged.rec["file_id"] = 62 + + store = staged.store("data_file", ".zarr") + assert isinstance(store, fsspec.FSMap) + + # Write some data + store["test_key"] = b"test_value" + + assert len(table) == 1 + + table.delete() + + def test_staged_insert_fs_property(self, schema_obj, mock_object_storage): + """Test staged insert fs property returns filesystem.""" + import fsspec + + table = ObjectFile() + + with table.staged_insert1 as staged: + staged.rec["file_id"] = 63 + + fs = staged.fs + assert isinstance(fs, fsspec.AbstractFileSystem) + + # Just open and write to test fs works + with staged.open("data_file", ".txt") as f: + f.write(b"test") + + table.delete() + + def test_staged_insert_missing_pk_raises(self, schema_obj, mock_object_storage): + """Test that staged insert raises if PK not set before store().""" + table = ObjectFile() + + with pytest.raises(dj.DataJointError, match="Primary key"): + with table.staged_insert1 as staged: + # Don't set primary key + staged.store("data_file", ".dat") + + +class TestRemoteURLSupport: + """Tests for remote URL detection and parsing.""" + + def test_is_remote_url_s3(self): + """Test S3 URL detection.""" + from datajoint.storage import is_remote_url + + assert is_remote_url("s3://bucket/path/file.dat") is True + assert is_remote_url("S3://bucket/path/file.dat") is True + + def test_is_remote_url_gcs(self): + """Test GCS URL detection.""" + from datajoint.storage import is_remote_url + + assert is_remote_url("gs://bucket/path/file.dat") is True + assert is_remote_url("gcs://bucket/path/file.dat") is True + + def test_is_remote_url_azure(self): + """Test Azure URL detection.""" + from datajoint.storage import is_remote_url + + assert is_remote_url("az://container/path/file.dat") is True + assert is_remote_url("abfs://container/path/file.dat") is True + + def test_is_remote_url_http(self): + """Test HTTP/HTTPS URL detection.""" + from datajoint.storage import is_remote_url + + assert is_remote_url("http://example.com/path/file.dat") is True + assert is_remote_url("https://example.com/path/file.dat") is True + + def test_is_remote_url_local_path(self): + """Test local paths are not detected as remote.""" + from datajoint.storage import is_remote_url + + assert is_remote_url("/local/path/file.dat") is False + assert is_remote_url("relative/path/file.dat") is False + assert is_remote_url("C:\\Windows\\path\\file.dat") is False + + def test_is_remote_url_non_string(self): + """Test non-string inputs return False.""" + from datajoint.storage import is_remote_url + + assert is_remote_url(None) is False + assert is_remote_url(123) is False + assert is_remote_url(Path("/local/path")) is False + + def test_parse_remote_url_s3(self): + """Test S3 URL parsing.""" + from datajoint.storage import parse_remote_url + + protocol, path = parse_remote_url("s3://bucket/path/file.dat") + assert protocol == "s3" + assert path == "bucket/path/file.dat" + + def test_parse_remote_url_gcs(self): + """Test GCS URL parsing.""" + from datajoint.storage import parse_remote_url + + protocol, path = parse_remote_url("gs://bucket/path/file.dat") + assert protocol == "gcs" + assert path == "bucket/path/file.dat" + + protocol, path = parse_remote_url("gcs://bucket/path/file.dat") + assert protocol == "gcs" + assert path == "bucket/path/file.dat" + + def test_parse_remote_url_azure(self): + """Test Azure URL parsing.""" + from datajoint.storage import parse_remote_url + + protocol, path = parse_remote_url("az://container/path/file.dat") + assert protocol == "abfs" + assert path == "container/path/file.dat" + + def test_parse_remote_url_http(self): + """Test HTTP URL parsing.""" + from datajoint.storage import parse_remote_url + + protocol, path = parse_remote_url("https://example.com/path/file.dat") + assert protocol == "https" + assert path == "example.com/path/file.dat" + + def test_parse_remote_url_unsupported(self): + """Test unsupported protocol raises error.""" + from datajoint.storage import parse_remote_url + + with pytest.raises(dj.DataJointError, match="Unsupported remote URL"): + parse_remote_url("ftp://server/path/file.dat")