From 2be5f11043f5f3a7ef14c504eff41453e06539cb Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 02:18:03 +0000 Subject: [PATCH 01/41] Introduce AttributeType system to replace AttributeAdapter This commit introduces a modern, extensible custom type system for DataJoint: **New Features:** - AttributeType base class with encode()/decode() methods - Global type registry with @register_type decorator - Entry point discovery for third-party type packages (datajoint.types) - Type chaining: dtype can reference another custom type - Automatic validation via validate() method before encoding - resolve_dtype() for resolving chained types **API Changes:** - New: dj.AttributeType, dj.register_type, dj.list_types - AttributeAdapter is now deprecated (backward-compatible wrapper) - Feature flag DJ_SUPPORT_ADAPTED_TYPES is no longer required **Entry Point Specification:** Third-party packages can declare types in pyproject.toml: [project.entry-points."datajoint.types"] zarr_array = "dj_zarr:ZarrArrayType" **Migration Path:** Old AttributeAdapter subclasses continue to work but emit DeprecationWarning. Migrate to AttributeType with encode/decode. --- src/datajoint/__init__.py | 6 +- src/datajoint/attribute_adapter.py | 188 +++++++++++-- src/datajoint/attribute_type.py | 413 +++++++++++++++++++++++++++++ src/datajoint/declare.py | 4 +- src/datajoint/fetch.py | 5 +- src/datajoint/heading.py | 43 ++- src/datajoint/table.py | 4 +- tests/conftest.py | 11 +- tests/test_adapted_attributes.py | 22 +- tests/test_attribute_type.py | 347 ++++++++++++++++++++++++ 10 files changed, 993 insertions(+), 50 deletions(-) create mode 100644 src/datajoint/attribute_type.py create mode 100644 tests/test_attribute_type.py diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py index 0f8123c66..feff400bf 100644 --- a/src/datajoint/__init__.py +++ b/src/datajoint/__init__.py @@ -45,7 +45,10 @@ "kill", "MatCell", "MatStruct", - "AttributeAdapter", + "AttributeType", + "register_type", + "list_types", + "AttributeAdapter", # Deprecated, use AttributeType "errors", "DataJointError", "key", @@ -57,6 +60,7 @@ from . import errors from .admin import kill from .attribute_adapter import AttributeAdapter +from .attribute_type import AttributeType, list_types, register_type from .blob import MatCell, MatStruct from .cli import cli from .connection import Connection, conn diff --git a/src/datajoint/attribute_adapter.py b/src/datajoint/attribute_adapter.py index 12a34f27e..5c687bff6 100644 --- a/src/datajoint/attribute_adapter.py +++ b/src/datajoint/attribute_adapter.py @@ -1,61 +1,191 @@ +""" +Legacy attribute adapter module. + +This module provides backward compatibility for the deprecated AttributeAdapter class. +New code should use :class:`datajoint.AttributeType` instead. + +.. deprecated:: 0.15 + Use :class:`datajoint.AttributeType` with ``encode``/``decode`` methods. +""" + import re +import warnings +from typing import Any -from .errors import DataJointError, _support_adapted_types +from .attribute_type import AttributeType, get_type, is_type_registered +from .errors import DataJointError -class AttributeAdapter: +class AttributeAdapter(AttributeType): """ - Base class for adapter objects for user-defined attribute types. + Legacy base class for attribute adapters. + + .. deprecated:: 0.15 + Use :class:`datajoint.AttributeType` with ``encode``/``decode`` methods instead. + + This class provides backward compatibility for existing adapters that use + the ``attribute_type``, ``put()``, and ``get()`` API. 
+ + Migration guide:: + + # Old style (deprecated): + class GraphAdapter(dj.AttributeAdapter): + attribute_type = "longblob" + + def put(self, graph): + return list(graph.edges) + + def get(self, edges): + return nx.Graph(edges) + + # New style (recommended): + @dj.register_type + class GraphType(dj.AttributeType): + type_name = "graph" + dtype = "longblob" + + def encode(self, graph, *, key=None): + return list(graph.edges) + + def decode(self, edges, *, key=None): + return nx.Graph(edges) """ + # Subclasses can set this as a class attribute instead of property + attribute_type: str = None # type: ignore + + def __init__(self): + # Emit deprecation warning on instantiation + warnings.warn( + f"{self.__class__.__name__} uses the deprecated AttributeAdapter API. " + "Migrate to AttributeType with encode/decode methods.", + DeprecationWarning, + stacklevel=2, + ) + @property - def attribute_type(self): + def type_name(self) -> str: """ - :return: a supported DataJoint attribute type to use; e.g. "longblob", "blob@store" + Infer type name from class name for legacy adapters. + + Legacy adapters were identified by their variable name in the context dict, + not by a property. For backward compatibility, we use the lowercase class name. """ - raise NotImplementedError("Undefined attribute adapter") + # Check if a _type_name was explicitly set (for context-based lookup) + if hasattr(self, "_type_name"): + return self._type_name + # Fall back to class name + return self.__class__.__name__.lower() - def get(self, value): + @property + def dtype(self) -> str: + """Map legacy attribute_type to new dtype property.""" + attr_type = self.attribute_type + if attr_type is None: + raise NotImplementedError( + f"{self.__class__.__name__} must define 'attribute_type' " + "(or migrate to AttributeType with 'dtype')" + ) + return attr_type + + def encode(self, value: Any, *, key: dict | None = None) -> Any: + """Delegate to legacy put() method.""" + return self.put(value) + + def decode(self, stored: Any, *, key: dict | None = None) -> Any: + """Delegate to legacy get() method.""" + return self.get(stored) + + def put(self, obj: Any) -> Any: """ - convert value retrieved from the the attribute in a table into the adapted type + Convert an object of the adapted type into a storable value. + + .. deprecated:: 0.15 + Override ``encode()`` instead. - :param value: value from the database + Args: + obj: An object of the adapted type. - :return: object of the adapted type + Returns: + Value to store in the database. """ - raise NotImplementedError("Undefined attribute adapter") + raise NotImplementedError( + f"{self.__class__.__name__} must implement put() or migrate to encode()" + ) - def put(self, obj): + def get(self, value: Any) -> Any: """ - convert an object of the adapted type into a value that DataJoint can store in a table attribute + Convert a value from the database into the adapted type. + + .. deprecated:: 0.15 + Override ``decode()`` instead. + + Args: + value: Value from the database. - :param obj: an object of the adapted type - :return: value to store in the database + Returns: + Object of the adapted type. """ - raise NotImplementedError("Undefined attribute adapter") + raise NotImplementedError( + f"{self.__class__.__name__} must implement get() or migrate to decode()" + ) -def get_adapter(context, adapter_name): +def get_adapter(context: dict | None, adapter_name: str) -> AttributeType: """ - Extract the AttributeAdapter object by its name from the context and validate. 
+ Get an attribute type/adapter by name. + + This function provides backward compatibility by checking both: + 1. The global type registry (new system) + 2. The schema context dict (legacy system) + + Args: + context: Schema context dictionary (for legacy adapters). + adapter_name: The adapter/type name, with or without angle brackets. + + Returns: + The AttributeType instance. + + Raises: + DataJointError: If the adapter is not found or invalid. """ - if not _support_adapted_types(): - raise DataJointError("Support for Adapted Attribute types is disabled.") adapter_name = adapter_name.lstrip("<").rstrip(">") + + # First, check the global type registry (new system) + if is_type_registered(adapter_name): + return get_type(adapter_name) + + # Fall back to context-based lookup (legacy system) + if context is None: + raise DataJointError( + f"Attribute type <{adapter_name}> is not registered. " + "Use @dj.register_type to register custom types." + ) + try: adapter = context[adapter_name] except KeyError: - raise DataJointError("Attribute adapter '{adapter_name}' is not defined.".format(adapter_name=adapter_name)) - if not isinstance(adapter, AttributeAdapter): raise DataJointError( - "Attribute adapter '{adapter_name}' must be an instance of datajoint.AttributeAdapter".format( - adapter_name=adapter_name - ) + f"Attribute type <{adapter_name}> is not defined. " + "Register it with @dj.register_type or include it in the schema context." ) - if not isinstance(adapter.attribute_type, str) or not re.match(r"^\w", adapter.attribute_type): + + # Validate it's an AttributeType (or legacy AttributeAdapter) + if not isinstance(adapter, AttributeType): raise DataJointError( - "Invalid attribute type {type} in attribute adapter '{adapter_name}'".format( - type=adapter.attribute_type, adapter_name=adapter_name - ) + f"Attribute adapter '{adapter_name}' must be an instance of " + "datajoint.AttributeType (or legacy datajoint.AttributeAdapter)" ) + + # For legacy adapters from context, store the name they were looked up by + if isinstance(adapter, AttributeAdapter): + adapter._type_name = adapter_name + + # Validate the dtype/attribute_type + dtype = adapter.dtype + if not isinstance(dtype, str) or not re.match(r"^\w", dtype): + raise DataJointError( + f"Invalid dtype '{dtype}' in attribute type <{adapter_name}>" + ) + return adapter diff --git a/src/datajoint/attribute_type.py b/src/datajoint/attribute_type.py new file mode 100644 index 000000000..ac524d926 --- /dev/null +++ b/src/datajoint/attribute_type.py @@ -0,0 +1,413 @@ +""" +Custom attribute type system for DataJoint. + +This module provides the AttributeType base class and registration mechanism +for creating custom data types that extend DataJoint's native type system. + +Custom types enable seamless integration of complex Python objects (like NumPy arrays, +graphs, or domain-specific structures) with DataJoint's relational storage. 
+ +Example: + @dj.register_type + class GraphType(dj.AttributeType): + type_name = "graph" + dtype = "longblob" + + def encode(self, graph: nx.Graph) -> list: + return list(graph.edges) + + def decode(self, edges: list) -> nx.Graph: + return nx.Graph(edges) + + # Then use in table definitions: + class MyTable(dj.Manual): + definition = ''' + id : int + --- + data : + ''' +""" + +from __future__ import annotations + +import logging +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any + +from .errors import DataJointError + +if TYPE_CHECKING: + pass + +logger = logging.getLogger(__name__.split(".")[0]) + +# Global type registry - maps type_name to AttributeType instance +_type_registry: dict[str, AttributeType] = {} +_entry_points_loaded: bool = False + + +class AttributeType(ABC): + """ + Base class for custom DataJoint attribute types. + + Subclass this to create custom types that can be used in table definitions + with the ```` syntax. Custom types define bidirectional conversion + between Python objects and DataJoint's storage format. + + Attributes: + type_name: Unique identifier used in ```` syntax + dtype: Underlying DataJoint storage type + + Example: + @dj.register_type + class GraphType(dj.AttributeType): + type_name = "graph" + dtype = "longblob" + + def encode(self, graph): + return list(graph.edges) + + def decode(self, edges): + import networkx as nx + return nx.Graph(edges) + + The type can then be used in table definitions:: + + class Connectivity(dj.Manual): + definition = ''' + id : int + --- + graph_data : + ''' + """ + + @property + @abstractmethod + def type_name(self) -> str: + """ + Unique identifier for this type, used in table definitions as ````. + + This name must be unique across all registered types. It should be lowercase + with underscores (e.g., "graph", "zarr_array", "compressed_image"). + + Returns: + The type name string without angle brackets. + """ + ... + + @property + @abstractmethod + def dtype(self) -> str: + """ + The underlying DataJoint type used for storage. + + Can be: + - A native type: ``"longblob"``, ``"blob"``, ``"varchar(255)"``, ``"int"``, ``"json"`` + - An external type: ``"blob@store"``, ``"attach@store"`` + - The object type: ``"object"`` + - Another custom type: ``""`` (enables type chaining) + + Returns: + The storage type specification string. + """ + ... + + @abstractmethod + def encode(self, value: Any, *, key: dict | None = None) -> Any: + """ + Convert a Python object to the storable format. + + Called during INSERT operations to transform user-provided objects + into a format suitable for storage in the underlying ``dtype``. + + Args: + value: The Python object to store. + key: Primary key values as a dict. Available when the dtype uses + object storage and may be needed for path construction. + + Returns: + Value in the format expected by ``dtype``. For example: + - For ``dtype="longblob"``: any picklable Python object + - For ``dtype="object"``: path string or file-like object + - For ``dtype="varchar(N)"``: string + """ + ... + + @abstractmethod + def decode(self, stored: Any, *, key: dict | None = None) -> Any: + """ + Convert stored data back to a Python object. + + Called during FETCH operations to reconstruct the original Python + object from the stored format. + + Args: + stored: Data retrieved from storage. Type depends on ``dtype``: + - For ``"object"``: an ``ObjectRef`` handle + - For blob types: the unpacked Python object + - For native types: the native Python value (str, int, etc.) 
+ key: Primary key values as a dict. + + Returns: + The reconstructed Python object. + """ + ... + + def validate(self, value: Any) -> None: + """ + Validate a value before encoding. + + Override this method to add type checking or domain constraints. + Called automatically before ``encode()`` during INSERT operations. + The default implementation accepts any value. + + Args: + value: The value to validate. + + Raises: + TypeError: If the value has an incompatible type. + ValueError: If the value fails domain validation. + """ + pass + + def default(self) -> Any: + """ + Return a default value for this type. + + Override if the type has a sensible default value. The default + implementation raises NotImplementedError, indicating no default exists. + + Returns: + The default value for this type. + + Raises: + NotImplementedError: If no default exists (the default behavior). + """ + raise NotImplementedError(f"No default value for type <{self.type_name}>") + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}(type_name={self.type_name!r}, dtype={self.dtype!r})>" + + +def register_type(cls: type[AttributeType]) -> type[AttributeType]: + """ + Register a custom attribute type with DataJoint. + + Can be used as a decorator or called directly. The type becomes available + for use in table definitions with the ```` syntax. + + Args: + cls: An AttributeType subclass to register. + + Returns: + The same class, unmodified (allows use as decorator). + + Raises: + DataJointError: If a type with the same name is already registered + by a different class. + TypeError: If cls is not an AttributeType subclass. + + Example: + As a decorator:: + + @dj.register_type + class GraphType(dj.AttributeType): + type_name = "graph" + ... + + Or called directly:: + + dj.register_type(GraphType) + """ + if not isinstance(cls, type) or not issubclass(cls, AttributeType): + raise TypeError(f"register_type requires an AttributeType subclass, got {cls!r}") + + instance = cls() + name = instance.type_name + + if not isinstance(name, str) or not name: + raise DataJointError(f"type_name must be a non-empty string, got {name!r}") + + if name in _type_registry: + existing = _type_registry[name] + if type(existing) is not cls: + raise DataJointError( + f"Type <{name}> is already registered by " + f"{type(existing).__module__}.{type(existing).__name__}" + ) + # Same class registered twice - idempotent, no error + return cls + + _type_registry[name] = instance + logger.debug(f"Registered attribute type <{name}> from {cls.__module__}.{cls.__name__}") + return cls + + +def unregister_type(name: str) -> None: + """ + Remove a type from the registry. + + Primarily useful for testing. Use with caution in production code. + + Args: + name: The type_name to unregister. + + Raises: + DataJointError: If the type is not registered. + """ + name = name.strip("<>") + if name not in _type_registry: + raise DataJointError(f"Type <{name}> is not registered") + del _type_registry[name] + + +def get_type(name: str) -> AttributeType: + """ + Retrieve a registered attribute type by name. + + Looks up the type in the explicit registry first, then attempts + to load from installed packages via entry points. + + Args: + name: The type name, with or without angle brackets. + + Returns: + The registered AttributeType instance. + + Raises: + DataJointError: If the type is not found. 
+ """ + name = name.strip("<>") + + # Check explicit registry first + if name in _type_registry: + return _type_registry[name] + + # Lazy-load entry points + _load_entry_points() + + if name in _type_registry: + return _type_registry[name] + + raise DataJointError( + f"Unknown attribute type: <{name}>. " + f"Ensure the type is registered via @dj.register_type or installed as a package." + ) + + +def list_types() -> list[str]: + """ + List all registered type names. + + Returns: + Sorted list of registered type names. + """ + _load_entry_points() + return sorted(_type_registry.keys()) + + +def is_type_registered(name: str) -> bool: + """ + Check if a type name is registered. + + Args: + name: The type name to check. + + Returns: + True if the type is registered. + """ + name = name.strip("<>") + if name in _type_registry: + return True + _load_entry_points() + return name in _type_registry + + +def _load_entry_points() -> None: + """ + Load attribute types from installed packages via entry points. + + Types are discovered from the ``datajoint.types`` entry point group. + Packages declare types in pyproject.toml:: + + [project.entry-points."datajoint.types"] + zarr_array = "dj_zarr:ZarrArrayType" + + This function is idempotent - entry points are only loaded once. + """ + global _entry_points_loaded + if _entry_points_loaded: + return + + _entry_points_loaded = True + + try: + from importlib.metadata import entry_points + except ImportError: + # Python < 3.10 fallback + try: + from importlib_metadata import entry_points + except ImportError: + logger.debug("importlib.metadata not available, skipping entry point discovery") + return + + try: + # Python 3.10+ / importlib_metadata 3.6+ + eps = entry_points(group="datajoint.types") + except TypeError: + # Older API + eps = entry_points().get("datajoint.types", []) + + for ep in eps: + if ep.name in _type_registry: + # Already registered explicitly, skip entry point + continue + try: + type_class = ep.load() + register_type(type_class) + logger.debug(f"Loaded attribute type <{ep.name}> from entry point {ep.value}") + except Exception as e: + logger.warning(f"Failed to load attribute type '{ep.name}' from {ep.value}: {e}") + + +def resolve_dtype(dtype: str, seen: set[str] | None = None) -> tuple[str, list[AttributeType]]: + """ + Resolve a dtype string, following type chains. + + If dtype references another custom type (e.g., ""), recursively + resolves to find the ultimate storage type. + + Args: + dtype: The dtype string to resolve. + seen: Set of already-seen type names (for cycle detection). + + Returns: + Tuple of (final_storage_type, list_of_types_in_chain). + The chain is ordered from outermost to innermost type. + + Raises: + DataJointError: If a circular type reference is detected. 
+ """ + if seen is None: + seen = set() + + chain: list[AttributeType] = [] + + # Check if dtype is a custom type reference + if dtype.startswith("<") and dtype.endswith(">"): + type_name = dtype[1:-1] + + if type_name in seen: + raise DataJointError(f"Circular type reference detected: <{type_name}>") + + seen.add(type_name) + attr_type = get_type(type_name) + chain.append(attr_type) + + # Recursively resolve the inner dtype + inner_dtype, inner_chain = resolve_dtype(attr_type.dtype, seen) + chain.extend(inner_chain) + return inner_dtype, chain + + # Not a custom type - return as-is + return dtype, chain diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index c1a22f0ca..995984389 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -480,8 +480,8 @@ def substitute_special_type(match, category, foreign_key_sql, context): "ON UPDATE RESTRICT ON DELETE RESTRICT".format(external_table_root=EXTERNAL_TABLE_ROOT, **match) ) elif category == "ADAPTED": - adapter = get_adapter(context, match["type"]) - match["type"] = adapter.attribute_type + attr_type = get_adapter(context, match["type"]) + match["type"] = attr_type.dtype category = match_type(match["type"]) if category in SPECIAL_TYPES: # recursive redefinition from user-defined datatypes. diff --git a/src/datajoint/fetch.py b/src/datajoint/fetch.py index 5d02b52b0..0cac13632 100644 --- a/src/datajoint/fetch.py +++ b/src/datajoint/fetch.py @@ -53,8 +53,9 @@ def _get(connection, attr, data, squeeze, download_path): extern = connection.schemas[attr.database].external[attr.store] if attr.is_external else None - # apply attribute adapter if present - adapt = attr.adapter.get if attr.adapter else lambda x: x + # apply custom attribute type decoder if present + def adapt(x): + return attr.adapter.decode(x, key=None) if attr.adapter else x if attr.is_filepath: return adapt(extern.download_filepath(uuid.UUID(bytes=data))[0]) diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index 45e35998c..1e40451ee 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -5,7 +5,8 @@ import numpy as np -from .attribute_adapter import AttributeAdapter, get_adapter +from .attribute_adapter import get_adapter +from .attribute_type import AttributeType from .declare import ( EXTERNAL_TYPES, NATIVE_TYPES, @@ -15,6 +16,36 @@ ) from .errors import FILEPATH_FEATURE_SWITCH, DataJointError, _support_filepath_types + +class _MissingType(AttributeType): + """Placeholder for missing/unregistered attribute types. Raises error on use.""" + + def __init__(self, name: str): + self._name = name + + @property + def type_name(self) -> str: + return self._name + + @property + def dtype(self) -> str: + raise DataJointError( + f"Attribute type <{self._name}> is not registered. " + "Register it with @dj.register_type or include it in the schema context." + ) + + def encode(self, value, *, key=None): + raise DataJointError( + f"Attribute type <{self._name}> is not registered. " + "Register it with @dj.register_type or include it in the schema context." + ) + + def decode(self, stored, *, key=None): + raise DataJointError( + f"Attribute type <{self._name}> is not registered. " + "Register it with @dj.register_type or include it in the schema context." 
+ ) + logger = logging.getLogger(__name__.split(".")[0]) default_attribute_properties = dict( # these default values are set in computed attributes @@ -279,7 +310,7 @@ def _init_from_database(self): if special: special = special.groupdict() attr.update(special) - # process adapted attribute types + # process custom attribute types (adapted types) if special and TYPE_PATTERN["ADAPTED"].match(attr["type"]): assert context is not None, "Declaration context is not set" adapter_name = special["type"] @@ -287,14 +318,12 @@ def _init_from_database(self): attr.update(adapter=get_adapter(context, adapter_name)) except DataJointError: # if no adapter, then delay the error until the first invocation - attr.update(adapter=AttributeAdapter()) + attr.update(adapter=_MissingType(adapter_name)) else: - attr.update(type=attr["adapter"].attribute_type) + attr.update(type=attr["adapter"].dtype) if not any(r.match(attr["type"]) for r in TYPE_PATTERN.values()): raise DataJointError( - "Invalid attribute type '{type}' in adapter object <{adapter_name}>.".format( - adapter_name=adapter_name, **attr - ) + f"Invalid dtype '{attr['type']}' in attribute type <{adapter_name}>." ) special = not any(TYPE_PATTERN[c].match(attr["type"]) for c in NATIVE_TYPES) diff --git a/src/datajoint/table.py b/src/datajoint/table.py index a8a52c3e0..20f579225 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -726,7 +726,9 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False): return None attr = self.heading[name] if attr.adapter: - value = attr.adapter.put(value) + # Custom attribute type: validate and encode + attr.adapter.validate(value) + value = attr.adapter.encode(value, key=None) if value is None or (attr.numeric and (value == "" or np.isnan(float(value)))): # set default value placeholder, value = "DEFAULT", None diff --git a/tests/conftest.py b/tests/conftest.py index 8a6ba4057..37241de86 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,7 +16,6 @@ import datajoint as dj from datajoint.errors import ( - ADAPTED_TYPE_SWITCH, FILEPATH_FEATURE_SWITCH, DataJointError, ) @@ -334,10 +333,14 @@ def monkeymodule(): @pytest.fixture -def enable_adapted_types(monkeypatch): - monkeypatch.setenv(ADAPTED_TYPE_SWITCH, "TRUE") +def enable_adapted_types(): + """ + Deprecated fixture - custom attribute types no longer require a feature flag. + + This fixture is kept for backward compatibility but does nothing. + Custom types are now enabled by default via the AttributeType system. + """ yield - monkeypatch.delenv(ADAPTED_TYPE_SWITCH, raising=True) @pytest.fixture diff --git a/tests/test_adapted_attributes.py b/tests/test_adapted_attributes.py index 1060a50ed..0b4285ffb 100644 --- a/tests/test_adapted_attributes.py +++ b/tests/test_adapted_attributes.py @@ -1,3 +1,10 @@ +""" +Tests for adapted/custom attribute types. + +These tests use the legacy AttributeAdapter API for backward compatibility testing. +""" + +import warnings from itertools import zip_longest import networkx as nx @@ -8,6 +15,9 @@ from . 
import schema_adapted from .schema_adapted import Connectivity, Layout +# Filter deprecation warnings from legacy AttributeAdapter usage in these tests +pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning") + @pytest.fixture def schema_name(prefix): @@ -16,24 +26,28 @@ def schema_name(prefix): @pytest.fixture def adapted_graph_instance(): - yield schema_adapted.GraphAdapter() + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + yield schema_adapted.GraphAdapter() @pytest.fixture def schema_ad( connection_test, adapted_graph_instance, - enable_adapted_types, enable_filepath_feature, s3_creds, tmpdir, schema_name, ): dj.config["stores"] = {"repo-s3": dict(s3_creds, protocol="s3", location="adapted/repo", stage=str(tmpdir))} + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + layout_adapter = schema_adapted.LayoutToFilepath() context = { **schema_adapted.LOCALS_ADAPTED, "graph": adapted_graph_instance, - "layout_to_filepath": schema_adapted.LayoutToFilepath(), + "layout_to_filepath": layout_adapter, } schema = dj.schema(schema_name, context=context, connection=connection_test) schema(schema_adapted.Connectivity) @@ -92,7 +106,7 @@ def test_adapted_filepath_type(schema_ad, minio_client): c.delete() -def test_adapted_spawned(local_schema, enable_adapted_types): +def test_adapted_spawned(local_schema): c = Connectivity() # a spawned class graphs = [ nx.lollipop_graph(4, 2), diff --git a/tests/test_attribute_type.py b/tests/test_attribute_type.py new file mode 100644 index 000000000..294b7eee8 --- /dev/null +++ b/tests/test_attribute_type.py @@ -0,0 +1,347 @@ +""" +Tests for the new AttributeType system. +""" + +import pytest + +import datajoint as dj +from datajoint.attribute_type import ( + AttributeType, + _type_registry, + get_type, + is_type_registered, + list_types, + register_type, + resolve_dtype, + unregister_type, +) +from datajoint.errors import DataJointError + + +class TestAttributeTypeRegistry: + """Tests for the type registry functionality.""" + + def setup_method(self): + """Clear any test types from registry before each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def teardown_method(self): + """Clean up test types after each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def test_register_type_decorator(self): + """Test registering a type using the decorator.""" + + @register_type + class TestType(AttributeType): + type_name = "test_decorator" + dtype = "longblob" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + assert is_type_registered("test_decorator") + assert get_type("test_decorator").type_name == "test_decorator" + + def test_register_type_direct(self): + """Test registering a type by calling register_type directly.""" + + class TestType(AttributeType): + type_name = "test_direct" + dtype = "varchar(255)" + + def encode(self, value, *, key=None): + return str(value) + + def decode(self, stored, *, key=None): + return stored + + register_type(TestType) + assert is_type_registered("test_direct") + + def test_register_type_idempotent(self): + """Test that registering the same type twice is idempotent.""" + + @register_type + class TestType(AttributeType): + type_name = "test_idempotent" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, 
stored, *, key=None): + return stored + + # Second registration should not raise + register_type(TestType) + assert is_type_registered("test_idempotent") + + def test_register_duplicate_name_different_class(self): + """Test that registering different classes with same name raises error.""" + + @register_type + class TestType1(AttributeType): + type_name = "test_duplicate" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + class TestType2(AttributeType): + type_name = "test_duplicate" + dtype = "varchar(100)" + + def encode(self, value, *, key=None): + return str(value) + + def decode(self, stored, *, key=None): + return stored + + with pytest.raises(DataJointError, match="already registered"): + register_type(TestType2) + + def test_unregister_type(self): + """Test unregistering a type.""" + + @register_type + class TestType(AttributeType): + type_name = "test_unregister" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + assert is_type_registered("test_unregister") + unregister_type("test_unregister") + assert not is_type_registered("test_unregister") + + def test_get_type_not_found(self): + """Test that getting an unregistered type raises error.""" + with pytest.raises(DataJointError, match="Unknown attribute type"): + get_type("nonexistent_type") + + def test_list_types(self): + """Test listing registered types.""" + + @register_type + class TestType(AttributeType): + type_name = "test_list" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + types = list_types() + assert "test_list" in types + assert types == sorted(types) # Should be sorted + + def test_get_type_strips_brackets(self): + """Test that get_type accepts names with or without angle brackets.""" + + @register_type + class TestType(AttributeType): + type_name = "test_brackets" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + assert get_type("test_brackets") is get_type("") + + +class TestAttributeTypeValidation: + """Tests for the validate method.""" + + def setup_method(self): + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def teardown_method(self): + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def test_validate_called_default(self): + """Test that default validate accepts any value.""" + + @register_type + class TestType(AttributeType): + type_name = "test_validate_default" + dtype = "longblob" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + t = get_type("test_validate_default") + # Default validate should not raise for any value + t.validate(None) + t.validate(42) + t.validate("string") + t.validate([1, 2, 3]) + + def test_validate_custom(self): + """Test custom validation logic.""" + + @register_type + class PositiveIntType(AttributeType): + type_name = "test_positive_int" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + def validate(self, value): + if not isinstance(value, int): + raise TypeError(f"Expected int, got {type(value).__name__}") + if value < 0: + raise ValueError("Value must be positive") + + t = 
get_type("test_positive_int") + t.validate(42) # Should pass + + with pytest.raises(TypeError): + t.validate("not an int") + + with pytest.raises(ValueError): + t.validate(-1) + + +class TestTypeChaining: + """Tests for type chaining (dtype referencing another custom type).""" + + def setup_method(self): + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def teardown_method(self): + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def test_resolve_native_dtype(self): + """Test resolving a native dtype.""" + final_dtype, chain = resolve_dtype("longblob") + assert final_dtype == "longblob" + assert chain == [] + + def test_resolve_custom_dtype(self): + """Test resolving a custom dtype.""" + + @register_type + class TestType(AttributeType): + type_name = "test_resolve" + dtype = "varchar(100)" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain = resolve_dtype("") + assert final_dtype == "varchar(100)" + assert len(chain) == 1 + assert chain[0].type_name == "test_resolve" + + def test_resolve_chained_dtype(self): + """Test resolving a chained dtype.""" + + @register_type + class InnerType(AttributeType): + type_name = "test_inner" + dtype = "longblob" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + @register_type + class OuterType(AttributeType): + type_name = "test_outer" + dtype = "" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain = resolve_dtype("") + assert final_dtype == "longblob" + assert len(chain) == 2 + assert chain[0].type_name == "test_outer" + assert chain[1].type_name == "test_inner" + + def test_circular_reference_detection(self): + """Test that circular type references are detected.""" + + @register_type + class TypeA(AttributeType): + type_name = "test_circular_a" + dtype = "" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + @register_type + class TypeB(AttributeType): + type_name = "test_circular_b" + dtype = "" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + with pytest.raises(DataJointError, match="Circular type reference"): + resolve_dtype("") + + +class TestExportsAndAPI: + """Test that the public API is properly exported.""" + + def test_exports_from_datajoint(self): + """Test that AttributeType and helpers are exported from datajoint.""" + assert hasattr(dj, "AttributeType") + assert hasattr(dj, "register_type") + assert hasattr(dj, "list_types") + + def test_attribute_adapter_deprecated(self): + """Test that AttributeAdapter is still available but deprecated.""" + assert hasattr(dj, "AttributeAdapter") + # AttributeAdapter should be a subclass of AttributeType + assert issubclass(dj.AttributeAdapter, dj.AttributeType) From 055c9c6d4fa7ad7a75a576bff85211e8f27a62cd Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 02:21:30 +0000 Subject: [PATCH 02/41] Update documentation for new AttributeType system - Rewrite customtype.md with comprehensive documentation: - Overview of encode/decode pattern - Required components (type_name, dtype, encode, decode) - Type registration with @dj.register_type decorator - Validation with validate() method - Storage types (dtype options) 
- Type chaining for composable types - Key parameter for context-aware encoding - Entry point packages for distribution - Complete neuroscience example - Migration guide from AttributeAdapter - Best practices - Update attributes.md to reference custom types --- docs/src/design/tables/attributes.md | 4 + docs/src/design/tables/customtype.md | 474 ++++++++++++++++++++++++--- 2 files changed, 440 insertions(+), 38 deletions(-) diff --git a/docs/src/design/tables/attributes.md b/docs/src/design/tables/attributes.md index 9363e527f..4f8a0644e 100644 --- a/docs/src/design/tables/attributes.md +++ b/docs/src/design/tables/attributes.md @@ -77,6 +77,10 @@ sending/receiving an opaque data file to/from a DataJoint pipeline. - `filepath@store`: a [filepath](filepath.md) used to link non-DataJoint managed files into a DataJoint pipeline. +- ``: a [custom attribute type](customtype.md) that defines bidirectional +conversion between Python objects and database storage formats. Use this to store +complex data types like graphs, domain-specific objects, or custom data structures. + ## Numeric type aliases DataJoint provides convenient type aliases that map to standard MySQL numeric types. diff --git a/docs/src/design/tables/customtype.md b/docs/src/design/tables/customtype.md index aad194ff5..43a168358 100644 --- a/docs/src/design/tables/customtype.md +++ b/docs/src/design/tables/customtype.md @@ -1,4 +1,4 @@ -# Custom Types +# Custom Attribute Types In modern scientific research, data pipelines often involve complex workflows that generate diverse data types. From high-dimensional imaging data to machine learning @@ -12,69 +12,467 @@ traditional relational databases. For example: + Computational biologists might store fitted machine learning models or parameter objects for downstream predictions. -To handle these diverse needs, DataJoint provides the `dj.AttributeAdapter` method. It +To handle these diverse needs, DataJoint provides the **AttributeType** system. It enables researchers to store and retrieve complex, non-standard data types—like Python objects or data structures—in a relational database while maintaining the reproducibility, modularity, and query capabilities required for scientific workflows. -## Uses in Scientific Research +## Overview -Imagine a neuroscience lab studying neural connectivity. Researchers might generate -graphs (e.g., networkx.Graph) to represent connections between brain regions, where: +Custom attribute types define bidirectional conversion between: -+ Nodes are brain regions. -+ Edges represent connections weighted by signal strength or another metric. +- **Python objects** (what your code works with) +- **Storage format** (what gets stored in the database) -Storing these graph objects in a database alongside other experimental data (e.g., -subject metadata, imaging parameters) ensures: - -1. Centralized Data Management: All experimental data and analysis results are stored - together for easy access and querying. -2. Reproducibility: The exact graph objects used in analysis can be retrieved later for - validation or further exploration. -3. Scalability: Graph data can be integrated into workflows for larger datasets or - across experiments. - -However, since graphs are not natively supported by relational databases, here’s where -`dj.AttributeAdapter` becomes essential. 
It allows researchers to define custom logic for -serializing graphs (e.g., as edge lists) and deserializing them back into Python -objects, bridging the gap between advanced data types and the database. +``` +┌─────────────────┐ encode() ┌─────────────────┐ +│ Python Object │ ───────────────► │ Storage Type │ +│ (e.g. Graph) │ │ (e.g. blob) │ +└─────────────────┘ decode() └─────────────────┘ + ◄─────────────── +``` -### Example: Storing Graphs in DataJoint +## Defining Custom Types -To store a networkx.Graph object in a DataJoint table, researchers can define a custom -attribute type in a datajoint table class: +Create a custom type by subclassing `dj.AttributeType` and implementing the required +methods: ```python import datajoint as dj +import networkx as nx -class GraphAdapter(dj.AttributeAdapter): +@dj.register_type +class GraphType(dj.AttributeType): + """Custom type for storing networkx graphs.""" - attribute_type = 'longblob' # this is how the attribute will be declared + # Required: unique identifier used in table definitions + type_name = "graph" - def put(self, obj): - # convert the nx.Graph object into an edge list - assert isinstance(obj, nx.Graph) - return list(obj.edges) + # Required: underlying DataJoint storage type + dtype = "longblob" - def get(self, value): - # convert edge list back into an nx.Graph - return nx.Graph(value) + def encode(self, graph, *, key=None): + """Convert graph to storable format (called on INSERT).""" + return list(graph.edges) + def decode(self, edges, *, key=None): + """Convert stored data back to graph (called on FETCH).""" + return nx.Graph(edges) +``` -# instantiate for use as a datajoint type -graph = GraphAdapter() +### Required Components +| Component | Description | +|-----------|-------------| +| `type_name` | Unique identifier used in table definitions with `` syntax | +| `dtype` | Underlying DataJoint type for storage (e.g., `"longblob"`, `"varchar(255)"`, `"json"`) | +| `encode(value, *, key=None)` | Converts Python object to storable format | +| `decode(stored, *, key=None)` | Converts stored data back to Python object | -# define a table with a graph attribute -schema = dj.schema('test_graphs') +### Using Custom Types in Tables +Once registered, use the type in table definitions with angle brackets: +```python @schema class Connectivity(dj.Manual): definition = """ conn_id : int --- - conn_graph = null : # a networkx.Graph object + conn_graph = null : # Uses the GraphType we defined """ ``` + +Insert and fetch work seamlessly: + +```python +import networkx as nx + +# Insert - encode() is called automatically +g = nx.lollipop_graph(4, 2) +Connectivity.insert1({"conn_id": 1, "conn_graph": g}) + +# Fetch - decode() is called automatically +result = (Connectivity & "conn_id = 1").fetch1("conn_graph") +assert isinstance(result, nx.Graph) +``` + +## Type Registration + +### Decorator Registration + +The simplest way to register a type is with the `@dj.register_type` decorator: + +```python +@dj.register_type +class MyType(dj.AttributeType): + type_name = "my_type" + ... +``` + +### Direct Registration + +You can also register types explicitly: + +```python +class MyType(dj.AttributeType): + type_name = "my_type" + ... + +dj.register_type(MyType) +``` + +### Listing Registered Types + +```python +# List all registered type names +print(dj.list_types()) +``` + +## Validation + +Add data validation by overriding the `validate()` method. 
It's called automatically +before `encode()` during INSERT operations: + +```python +@dj.register_type +class PositiveArrayType(dj.AttributeType): + type_name = "positive_array" + dtype = "longblob" + + def validate(self, value): + """Ensure all values are positive.""" + import numpy as np + if not isinstance(value, np.ndarray): + raise TypeError(f"Expected numpy array, got {type(value).__name__}") + if np.any(value < 0): + raise ValueError("Array must contain only positive values") + + def encode(self, array, *, key=None): + return array + + def decode(self, stored, *, key=None): + return stored +``` + +## Storage Types (dtype) + +The `dtype` property specifies how data is stored in the database: + +| dtype | Use Case | Stored Format | +|-------|----------|---------------| +| `"longblob"` | Complex Python objects, arrays | Serialized binary | +| `"blob"` | Smaller objects | Serialized binary | +| `"json"` | JSON-serializable data | JSON string | +| `"varchar(N)"` | String representations | Text | +| `"int"` | Integer identifiers | Integer | +| `"blob@store"` | Large objects in external storage | UUID reference | +| `"object"` | Files/folders in object storage | JSON metadata | +| `""` | Chain to another custom type | Varies | + +### External Storage + +For large data, use external blob storage: + +```python +@dj.register_type +class LargeArrayType(dj.AttributeType): + type_name = "large_array" + dtype = "blob@mystore" # Uses external store named "mystore" + + def encode(self, array, *, key=None): + return array + + def decode(self, stored, *, key=None): + return stored +``` + +## Type Chaining + +Custom types can build on other custom types by referencing them in `dtype`: + +```python +@dj.register_type +class CompressedGraphType(dj.AttributeType): + type_name = "compressed_graph" + dtype = "" # Chain to the GraphType + + def encode(self, graph, *, key=None): + # Compress before passing to GraphType + return self._compress(graph) + + def decode(self, stored, *, key=None): + # GraphType's decode already ran + return self._decompress(stored) +``` + +DataJoint automatically resolves the chain to find the final storage type. + +## The Key Parameter + +The `key` parameter provides access to primary key values during encode/decode +operations. This is useful when the conversion depends on record context: + +```python +@dj.register_type +class ContextAwareType(dj.AttributeType): + type_name = "context_aware" + dtype = "longblob" + + def encode(self, value, *, key=None): + if key and key.get("version") == 2: + return self._encode_v2(value) + return self._encode_v1(value) + + def decode(self, stored, *, key=None): + if key and key.get("version") == 2: + return self._decode_v2(stored) + return self._decode_v1(stored) +``` + +## Publishing Custom Types as Packages + +Custom types can be distributed as installable packages using Python entry points. +This allows types to be automatically discovered when the package is installed. 
+ +### Package Structure + +``` +dj-graph-types/ +├── pyproject.toml +└── src/ + └── dj_graph_types/ + ├── __init__.py + └── types.py +``` + +### pyproject.toml + +```toml +[project] +name = "dj-graph-types" +version = "1.0.0" + +[project.entry-points."datajoint.types"] +graph = "dj_graph_types.types:GraphType" +weighted_graph = "dj_graph_types.types:WeightedGraphType" +``` + +### Type Implementation + +```python +# src/dj_graph_types/types.py +import datajoint as dj +import networkx as nx + +class GraphType(dj.AttributeType): + type_name = "graph" + dtype = "longblob" + + def encode(self, graph, *, key=None): + return list(graph.edges) + + def decode(self, edges, *, key=None): + return nx.Graph(edges) + +class WeightedGraphType(dj.AttributeType): + type_name = "weighted_graph" + dtype = "longblob" + + def encode(self, graph, *, key=None): + return [(u, v, d) for u, v, d in graph.edges(data=True)] + + def decode(self, edges, *, key=None): + g = nx.Graph() + g.add_weighted_edges_from(edges) + return g +``` + +### Usage After Installation + +```bash +pip install dj-graph-types +``` + +```python +# Types are automatically available after package installation +@schema +class MyTable(dj.Manual): + definition = """ + id : int + --- + network : + weighted_network : + """ +``` + +## Complete Example + +Here's a complete example demonstrating custom types for a neuroscience workflow: + +```python +import datajoint as dj +import numpy as np + +# Configure DataJoint +dj.config["database.host"] = "localhost" +dj.config["database.user"] = "root" +dj.config["database.password"] = "password" + +# Define custom types +@dj.register_type +class SpikeTrainType(dj.AttributeType): + """Efficient storage for sparse spike timing data.""" + type_name = "spike_train" + dtype = "longblob" + + def validate(self, value): + if not isinstance(value, np.ndarray): + raise TypeError("Expected numpy array of spike times") + if value.ndim != 1: + raise ValueError("Spike train must be 1-dimensional") + if not np.all(np.diff(value) >= 0): + raise ValueError("Spike times must be sorted") + + def encode(self, spike_times, *, key=None): + # Store as differences (smaller values, better compression) + return np.diff(spike_times, prepend=0).astype(np.float32) + + def decode(self, stored, *, key=None): + # Reconstruct original spike times + return np.cumsum(stored).astype(np.float64) + + +@dj.register_type +class WaveformType(dj.AttributeType): + """Storage for spike waveform templates with metadata.""" + type_name = "waveform" + dtype = "longblob" + + def encode(self, waveform_dict, *, key=None): + return { + "data": waveform_dict["data"].astype(np.float32), + "sampling_rate": waveform_dict["sampling_rate"], + "channel_ids": list(waveform_dict["channel_ids"]), + } + + def decode(self, stored, *, key=None): + return { + "data": stored["data"].astype(np.float64), + "sampling_rate": stored["sampling_rate"], + "channel_ids": np.array(stored["channel_ids"]), + } + + +# Create schema and tables +schema = dj.schema("ephys_analysis") + +@schema +class Unit(dj.Manual): + definition = """ + unit_id : int + --- + spike_times : + waveform : + quality : enum('good', 'mua', 'noise') + """ + + +# Usage +spike_times = np.array([0.1, 0.15, 0.23, 0.45, 0.67, 0.89]) +waveform = { + "data": np.random.randn(82, 4), + "sampling_rate": 30000, + "channel_ids": [10, 11, 12, 13], +} + +Unit.insert1({ + "unit_id": 1, + "spike_times": spike_times, + "waveform": waveform, + "quality": "good", +}) + +# Fetch - automatically decoded +result = (Unit & "unit_id = 
1").fetch1() +print(f"Spike times: {result['spike_times']}") +print(f"Waveform shape: {result['waveform']['data'].shape}") +``` + +## Migration from AttributeAdapter + +The `AttributeAdapter` class is deprecated. Migrate to `AttributeType`: + +### Before (deprecated) + +```python +class GraphAdapter(dj.AttributeAdapter): + attribute_type = "longblob" + + def put(self, obj): + return list(obj.edges) + + def get(self, value): + return nx.Graph(value) + +# Required context-based registration +graph = GraphAdapter() +schema = dj.schema("mydb", context={"graph": graph}) +``` + +### After (recommended) + +```python +@dj.register_type +class GraphType(dj.AttributeType): + type_name = "graph" + dtype = "longblob" + + def encode(self, obj, *, key=None): + return list(obj.edges) + + def decode(self, value, *, key=None): + return nx.Graph(value) + +# Global registration - no context needed +schema = dj.schema("mydb") +``` + +### Key Differences + +| Aspect | AttributeAdapter (deprecated) | AttributeType (recommended) | +|--------|-------------------------------|----------------------------| +| Methods | `put()` / `get()` | `encode()` / `decode()` | +| Storage type | `attribute_type` | `dtype` | +| Type name | Variable name in context | `type_name` property | +| Registration | Context dict per schema | Global `@register_type` decorator | +| Validation | Manual | Built-in `validate()` method | +| Distribution | Copy adapter code | Entry point packages | +| Key access | Not available | Optional `key` parameter | + +## Best Practices + +1. **Choose descriptive type names**: Use lowercase with underscores (e.g., `spike_train`, `graph_embedding`) + +2. **Select appropriate storage types**: Use `longblob` for complex objects, `json` for simple structures, external storage for large data + +3. **Add validation**: Use `validate()` to catch data errors early + +4. **Document your types**: Include docstrings explaining the expected input/output formats + +5. **Handle None values**: Your encode/decode methods may receive `None` for nullable attributes + +6. **Consider versioning**: If your encoding format might change, include version information + +7. **Test round-trips**: Ensure `decode(encode(x)) == x` for all valid inputs + +```python +def test_graph_type_roundtrip(): + g = nx.lollipop_graph(4, 2) + t = GraphType() + + encoded = t.encode(g) + decoded = t.decode(encoded) + + assert set(g.edges) == set(decoded.edges) +``` From af9bd8dfac0a3e11977ff813bef6865942a6e8ff Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 02:30:59 +0000 Subject: [PATCH 03/41] Apply ruff-format fixes to AttributeType implementation --- src/datajoint/attribute_adapter.py | 18 +++++------------- src/datajoint/attribute_type.py | 6 ++---- src/datajoint/heading.py | 5 ++--- 3 files changed, 9 insertions(+), 20 deletions(-) diff --git a/src/datajoint/attribute_adapter.py b/src/datajoint/attribute_adapter.py index 5c687bff6..7e49abb5c 100644 --- a/src/datajoint/attribute_adapter.py +++ b/src/datajoint/attribute_adapter.py @@ -83,8 +83,7 @@ def dtype(self) -> str: attr_type = self.attribute_type if attr_type is None: raise NotImplementedError( - f"{self.__class__.__name__} must define 'attribute_type' " - "(or migrate to AttributeType with 'dtype')" + f"{self.__class__.__name__} must define 'attribute_type' " "(or migrate to AttributeType with 'dtype')" ) return attr_type @@ -109,9 +108,7 @@ def put(self, obj: Any) -> Any: Returns: Value to store in the database. 
""" - raise NotImplementedError( - f"{self.__class__.__name__} must implement put() or migrate to encode()" - ) + raise NotImplementedError(f"{self.__class__.__name__} must implement put() or migrate to encode()") def get(self, value: Any) -> Any: """ @@ -126,9 +123,7 @@ def get(self, value: Any) -> Any: Returns: Object of the adapted type. """ - raise NotImplementedError( - f"{self.__class__.__name__} must implement get() or migrate to decode()" - ) + raise NotImplementedError(f"{self.__class__.__name__} must implement get() or migrate to decode()") def get_adapter(context: dict | None, adapter_name: str) -> AttributeType: @@ -158,8 +153,7 @@ def get_adapter(context: dict | None, adapter_name: str) -> AttributeType: # Fall back to context-based lookup (legacy system) if context is None: raise DataJointError( - f"Attribute type <{adapter_name}> is not registered. " - "Use @dj.register_type to register custom types." + f"Attribute type <{adapter_name}> is not registered. " "Use @dj.register_type to register custom types." ) try: @@ -184,8 +178,6 @@ def get_adapter(context: dict | None, adapter_name: str) -> AttributeType: # Validate the dtype/attribute_type dtype = adapter.dtype if not isinstance(dtype, str) or not re.match(r"^\w", dtype): - raise DataJointError( - f"Invalid dtype '{dtype}' in attribute type <{adapter_name}>" - ) + raise DataJointError(f"Invalid dtype '{dtype}' in attribute type <{adapter_name}>") return adapter diff --git a/src/datajoint/attribute_type.py b/src/datajoint/attribute_type.py index ac524d926..31393b2a9 100644 --- a/src/datajoint/attribute_type.py +++ b/src/datajoint/attribute_type.py @@ -232,8 +232,7 @@ class GraphType(dj.AttributeType): existing = _type_registry[name] if type(existing) is not cls: raise DataJointError( - f"Type <{name}> is already registered by " - f"{type(existing).__module__}.{type(existing).__name__}" + f"Type <{name}> is already registered by " f"{type(existing).__module__}.{type(existing).__name__}" ) # Same class registered twice - idempotent, no error return cls @@ -290,8 +289,7 @@ def get_type(name: str) -> AttributeType: return _type_registry[name] raise DataJointError( - f"Unknown attribute type: <{name}>. " - f"Ensure the type is registered via @dj.register_type or installed as a package." + f"Unknown attribute type: <{name}>. " f"Ensure the type is registered via @dj.register_type or installed as a package." ) diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index 1e40451ee..6b89b9eb1 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -46,6 +46,7 @@ def decode(self, stored, *, key=None): "Register it with @dj.register_type or include it in the schema context." ) + logger = logging.getLogger(__name__.split(".")[0]) default_attribute_properties = dict( # these default values are set in computed attributes @@ -322,9 +323,7 @@ def _init_from_database(self): else: attr.update(type=attr["adapter"].dtype) if not any(r.match(attr["type"]) for r in TYPE_PATTERN.values()): - raise DataJointError( - f"Invalid dtype '{attr['type']}' in attribute type <{adapter_name}>." 
- ) + raise DataJointError(f"Invalid dtype '{attr['type']}' in attribute type <{adapter_name}>.") special = not any(TYPE_PATTERN[c].match(attr["type"]) for c in NATIVE_TYPES) if special: From 9bd37f6675f5eaed047109a01979edb51e035c3a Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 02:52:28 +0000 Subject: [PATCH 04/41] Add DJBlobType and migration utilities for blob columns Introduces `` as an explicit AttributeType for DataJoint's native blob serialization, allowing users to be explicit about serialization behavior in table definitions. Key changes: - Add DJBlobType class with `serializes=True` flag to indicate it handles its own serialization (avoiding double pack/unpack) - Update table.py and fetch.py to respect the `serializes` flag, skipping blob.pack/unpack when adapter handles serialization - Add `dj.migrate` module with utilities for migrating existing schemas to use explicit `` type declarations - Add tests for DJBlobType functionality - Document `` type and migration procedure The migration is metadata-only - blob data format is unchanged. Existing `longblob` columns continue to work with implicit serialization for backward compatibility. --- docs/src/design/tables/customtype.md | 114 ++++++++++++ src/datajoint/__init__.py | 1 + src/datajoint/attribute_type.py | 125 ++++++++++++++ src/datajoint/fetch.py | 22 ++- src/datajoint/migrate.py | 249 +++++++++++++++++++++++++++ src/datajoint/table.py | 7 +- tests/test_attribute_type.py | 68 ++++++++ 7 files changed, 572 insertions(+), 14 deletions(-) create mode 100644 src/datajoint/migrate.py diff --git a/docs/src/design/tables/customtype.md b/docs/src/design/tables/customtype.md index 43a168358..4299df24d 100644 --- a/docs/src/design/tables/customtype.md +++ b/docs/src/design/tables/customtype.md @@ -476,3 +476,117 @@ def test_graph_type_roundtrip(): assert set(g.edges) == set(decoded.edges) ``` + +## Built-in Types + +DataJoint includes a built-in type for explicit blob serialization: + +### `` - DataJoint Blob Serialization + +The `` type provides explicit control over DataJoint's native binary +serialization. It supports: + +- NumPy arrays (compatible with MATLAB) +- Python dicts, lists, tuples, sets +- datetime objects, Decimals, UUIDs +- Nested data structures +- Optional compression + +```python +@schema +class ProcessedData(dj.Manual): + definition = """ + data_id : int + --- + results : # Explicit serialization + raw_bytes : longblob # Backward-compatible (auto-serialized) + """ +``` + +#### When to Use `` + +- **New tables**: Prefer `` for clarity and future-proofing +- **Custom types**: Use `` when your type chains to blob storage +- **Migration**: Existing `longblob` columns can be migrated to `` + +#### Backward Compatibility + +For backward compatibility, `longblob` columns without an explicit type +still receive automatic serialization. The behavior is identical to ``, +but using `` makes the serialization explicit in your code. + +## Schema Migration + +When upgrading existing schemas to use explicit type declarations, DataJoint +provides migration utilities. 
+ +### Analyzing Blob Columns + +```python +import datajoint as dj + +schema = dj.schema("my_database") + +# Check migration status +status = dj.migrate.check_migration_status(schema) +print(f"Blob columns: {status['total_blob_columns']}") +print(f"Already migrated: {status['migrated']}") +print(f"Pending migration: {status['pending']}") +``` + +### Generating Migration SQL + +```python +# Preview migration (dry run) +result = dj.migrate.migrate_blob_columns(schema, dry_run=True) +for sql in result['sql_statements']: + print(sql) +``` + +### Applying Migration + +```python +# Apply migration +result = dj.migrate.migrate_blob_columns(schema, dry_run=False) +print(f"Migrated {result['migrated']} columns") +``` + +### Migration Details + +The migration updates MySQL column comments to include the type declaration. +This is a **metadata-only** change - the actual blob data format is unchanged. + +Before migration: +- Column: `longblob` +- Comment: `user comment` +- Behavior: Auto-serialization (implicit) + +After migration: +- Column: `longblob` +- Comment: `::user comment` +- Behavior: Explicit serialization via `` + +### Updating Table Definitions + +After database migration, update your Python table definitions for consistency: + +```python +# Before +class MyTable(dj.Manual): + definition = """ + id : int + --- + data : longblob # stored data + """ + +# After +class MyTable(dj.Manual): + definition = """ + id : int + --- + data : # stored data + """ +``` + +Both definitions work identically after migration, but using `` makes +the serialization explicit and documents the intended behavior. diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py index feff400bf..0a8492cf1 100644 --- a/src/datajoint/__init__.py +++ b/src/datajoint/__init__.py @@ -58,6 +58,7 @@ ] from . import errors +from . import migrate from .admin import kill from .attribute_adapter import AttributeAdapter from .attribute_type import AttributeType, list_types, register_type diff --git a/src/datajoint/attribute_type.py b/src/datajoint/attribute_type.py index 31393b2a9..d9a890a83 100644 --- a/src/datajoint/attribute_type.py +++ b/src/datajoint/attribute_type.py @@ -153,6 +153,10 @@ def decode(self, stored: Any, *, key: dict | None = None) -> Any: """ ... + # Class attribute: If True, encode() produces final binary data (no blob.pack needed) + # Override in subclasses that handle their own serialization + serializes: bool = False + def validate(self, value: Any) -> None: """ Validate a value before encoding. @@ -409,3 +413,124 @@ def resolve_dtype(dtype: str, seen: set[str] | None = None) -> tuple[str, list[A # Not a custom type - return as-is return dtype, chain + + +# ============================================================================= +# Built-in Attribute Types +# ============================================================================= + + +class DJBlobType(AttributeType): + """ + Built-in type for DataJoint's native serialization format. + + This type handles serialization of arbitrary Python objects (including NumPy arrays, + dictionaries, lists, etc.) using DataJoint's binary blob format. The format includes: + + - Protocol headers (``mYm`` for MATLAB-compatible, ``dj0`` for Python-native) + - Optional compression (zlib) + - Support for NumPy arrays, datetime objects, UUIDs, and nested structures + + The ```` type is the explicit way to specify DataJoint's serialization. + It stores data in a MySQL ``LONGBLOB`` column. 
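+
+    Decoding uses ``blob.unpack(..., squeeze=False)``, so fetched values keep
+    the exact shapes they were stored with (singleton dimensions are preserved).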
+ + Example: + @schema + class ProcessedData(dj.Manual): + definition = ''' + data_id : int + --- + results : # Explicit DataJoint serialization + raw_bytes : longblob # Raw bytes (no serialization) + ''' + + Note: + For backward compatibility, ``longblob`` columns without an explicit type + still use automatic serialization. Use ```` to be explicit about + serialization behavior. + """ + + type_name = "djblob" + dtype = "longblob" + serializes = True # This type handles its own serialization + + def encode(self, value: Any, *, key: dict | None = None) -> bytes: + """ + Serialize a Python object to DataJoint's blob format. + + Args: + value: Any serializable Python object (dict, list, numpy array, etc.) + key: Primary key values (unused for blob serialization). + + Returns: + Serialized bytes with protocol header and optional compression. + """ + from . import blob + + return blob.pack(value, compress=True) + + def decode(self, stored: bytes, *, key: dict | None = None) -> Any: + """ + Deserialize DataJoint blob format back to a Python object. + + Args: + stored: Serialized blob bytes. + key: Primary key values (unused for blob serialization). + + Returns: + The deserialized Python object. + """ + from . import blob + + return blob.unpack(stored, squeeze=False) + + +class DJBlobExternalType(AttributeType): + """ + Built-in type for externally-stored DataJoint blobs. + + Similar to ```` but stores data in external blob storage instead + of inline in the database. Useful for large objects. + + The store name is specified when defining the column type. + + Example: + @schema + class LargeData(dj.Manual): + definition = ''' + data_id : int + --- + large_array : blob@mystore # External storage with auto-serialization + ''' + """ + + # Note: This type isn't directly usable via syntax + # It's used internally when blob@store syntax is detected + type_name = "djblob_external" + dtype = "blob@store" # Placeholder - actual store is determined at declaration time + serializes = True # This type handles its own serialization + + def encode(self, value: Any, *, key: dict | None = None) -> bytes: + """Serialize a Python object to DataJoint's blob format.""" + from . import blob + + return blob.pack(value, compress=True) + + def decode(self, stored: bytes, *, key: dict | None = None) -> Any: + """Deserialize DataJoint blob format back to a Python object.""" + from . import blob + + return blob.unpack(stored, squeeze=False) + + +def _register_builtin_types() -> None: + """ + Register DataJoint's built-in attribute types. + + Called automatically during module initialization. 
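+
+    Only ``DJBlobType`` is registered here; ``DJBlobExternalType`` is reserved
+    for internal use when ``blob@store`` columns are detected.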
+ """ + register_type(DJBlobType) + + +# Register built-in types when module is loaded +_register_builtin_types() diff --git a/src/datajoint/fetch.py b/src/datajoint/fetch.py index 0cac13632..4dfe42c12 100644 --- a/src/datajoint/fetch.py +++ b/src/datajoint/fetch.py @@ -88,18 +88,16 @@ def adapt(x): safe_write(local_filepath, data.split(b"\0", 1)[1]) return adapt(str(local_filepath)) # download file from remote store - return adapt( - uuid.UUID(bytes=data) - if attr.uuid - else ( - blob.unpack( - extern.get(uuid.UUID(bytes=data)) if attr.is_external else data, - squeeze=squeeze, - ) - if attr.is_blob - else data - ) - ) + if attr.uuid: + return adapt(uuid.UUID(bytes=data)) + elif attr.is_blob: + blob_data = extern.get(uuid.UUID(bytes=data)) if attr.is_external else data + # Skip unpack if adapter handles its own deserialization + if attr.adapter and getattr(attr.adapter, "serializes", False): + return attr.adapter.decode(blob_data, key=None) + return adapt(blob.unpack(blob_data, squeeze=squeeze)) + else: + return adapt(data) class Fetch: diff --git a/src/datajoint/migrate.py b/src/datajoint/migrate.py new file mode 100644 index 000000000..e463da93a --- /dev/null +++ b/src/datajoint/migrate.py @@ -0,0 +1,249 @@ +""" +Migration utilities for DataJoint schema updates. + +This module provides tools for migrating existing schemas to use the new +AttributeType system, particularly for upgrading blob columns to use +explicit `` type declarations. +""" + +from __future__ import annotations + +import logging +import re +from typing import TYPE_CHECKING + +from .errors import DataJointError + +if TYPE_CHECKING: + from .connection import Connection + from .schemas import Schema + +logger = logging.getLogger(__name__.split(".")[0]) + +# Pattern to detect blob types +BLOB_TYPES = re.compile(r"^(tiny|small|medium|long|)blob$", re.I) + + +def analyze_blob_columns(schema: Schema) -> list[dict]: + """ + Analyze a schema to find blob columns that could be migrated to . + + This function identifies blob columns that: + 1. Have a MySQL blob type (tinyblob, blob, mediumblob, longblob) + 2. Do NOT already have an adapter/type specified in their comment + + Args: + schema: The DataJoint schema to analyze. + + Returns: + List of dicts with keys: + - table_name: Full table name (database.table) + - column_name: Name of the blob column + - column_type: MySQL column type + - current_comment: Current column comment + - needs_migration: True if column should be migrated + + Example: + >>> import datajoint as dj + >>> schema = dj.schema('my_database') + >>> columns = dj.migrate.analyze_blob_columns(schema) + >>> for col in columns: + ... if col['needs_migration']: + ... 
print(f"{col['table_name']}.{col['column_name']}") + """ + results = [] + + connection = schema.connection + + # Get all tables in the schema + tables_query = """ + SELECT TABLE_NAME + FROM information_schema.TABLES + WHERE TABLE_SCHEMA = %s + AND TABLE_TYPE = 'BASE TABLE' + AND TABLE_NAME NOT LIKE '~%%' + """ + + tables = connection.query(tables_query, args=(schema.database,)).fetchall() + + for (table_name,) in tables: + # Get column information for each table + columns_query = """ + SELECT COLUMN_NAME, COLUMN_TYPE, COLUMN_COMMENT + FROM information_schema.COLUMNS + WHERE TABLE_SCHEMA = %s + AND TABLE_NAME = %s + AND DATA_TYPE IN ('tinyblob', 'blob', 'mediumblob', 'longblob') + """ + + columns = connection.query(columns_query, args=(schema.database, table_name)).fetchall() + + for column_name, column_type, comment in columns: + # Check if comment already has an adapter type (starts with :type:) + has_adapter = comment and comment.startswith(":") + + results.append( + { + "table_name": f"{schema.database}.{table_name}", + "column_name": column_name, + "column_type": column_type, + "current_comment": comment or "", + "needs_migration": not has_adapter, + } + ) + + return results + + +def generate_migration_sql( + schema: Schema, + target_type: str = "djblob", + dry_run: bool = True, +) -> list[str]: + """ + Generate SQL statements to migrate blob columns to use . + + This generates ALTER TABLE statements that update column comments to + include the `::` prefix, marking them as using explicit + DataJoint blob serialization. + + Args: + schema: The DataJoint schema to migrate. + target_type: The type name to migrate to (default: "djblob"). + dry_run: If True, only return SQL without executing. + + Returns: + List of SQL ALTER TABLE statements. + + Example: + >>> sql_statements = dj.migrate.generate_migration_sql(schema) + >>> for sql in sql_statements: + ... print(sql) + + Note: + This is a metadata-only migration. The actual blob data format + remains unchanged - only the column comments are updated to + indicate explicit type handling. + """ + columns = analyze_blob_columns(schema) + sql_statements = [] + + for col in columns: + if not col["needs_migration"]: + continue + + # Build new comment with type prefix + old_comment = col["current_comment"] + new_comment = f":<{target_type}>:{old_comment}" + + # Escape special characters for SQL + new_comment_escaped = new_comment.replace("\\", "\\\\").replace("'", "\\'") + + # Parse table name + db_name, table_name = col["table_name"].split(".") + + # Generate ALTER TABLE statement + sql = ( + f"ALTER TABLE `{db_name}`.`{table_name}` " + f"MODIFY COLUMN `{col['column_name']}` {col['column_type']} " + f"COMMENT '{new_comment_escaped}'" + ) + sql_statements.append(sql) + + return sql_statements + + +def migrate_blob_columns( + schema: Schema, + target_type: str = "djblob", + dry_run: bool = True, +) -> dict: + """ + Migrate blob columns in a schema to use explicit type. + + This updates column comments in the database to include the type + declaration. The data format remains unchanged. + + Args: + schema: The DataJoint schema to migrate. + target_type: The type name to migrate to (default: "djblob"). + dry_run: If True, only preview changes without applying. 
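+
+    Note:
+        Only blob columns whose comments do not already begin with a type
+        declaration are altered; previously migrated columns are skipped.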
+ + Returns: + Dict with keys: + - analyzed: Number of blob columns analyzed + - needs_migration: Number of columns that need migration + - migrated: Number of columns migrated (0 if dry_run) + - sql_statements: List of SQL statements (executed or to be executed) + + Example: + >>> # Preview migration + >>> result = dj.migrate.migrate_blob_columns(schema, dry_run=True) + >>> print(f"Would migrate {result['needs_migration']} columns") + + >>> # Apply migration + >>> result = dj.migrate.migrate_blob_columns(schema, dry_run=False) + >>> print(f"Migrated {result['migrated']} columns") + + Warning: + After migration, table definitions should be updated to use + `` instead of `longblob` for consistency. The migration + only updates database metadata; source code changes are manual. + """ + columns = analyze_blob_columns(schema) + sql_statements = generate_migration_sql(schema, target_type=target_type) + + result = { + "analyzed": len(columns), + "needs_migration": sum(1 for c in columns if c["needs_migration"]), + "migrated": 0, + "sql_statements": sql_statements, + } + + if dry_run: + logger.info(f"Dry run: would migrate {result['needs_migration']} columns") + for sql in sql_statements: + logger.info(f" {sql}") + return result + + # Execute migrations + connection = schema.connection + for sql in sql_statements: + try: + connection.query(sql) + result["migrated"] += 1 + logger.info(f"Executed: {sql}") + except Exception as e: + logger.error(f"Failed to execute: {sql}\nError: {e}") + raise DataJointError(f"Migration failed: {e}") from e + + logger.info(f"Successfully migrated {result['migrated']} columns") + return result + + +def check_migration_status(schema: Schema) -> dict: + """ + Check the migration status of blob columns in a schema. + + Args: + schema: The DataJoint schema to check. 
+ + Returns: + Dict with keys: + - total_blob_columns: Total number of blob columns + - migrated: Number of columns with explicit type + - pending: Number of columns using implicit serialization + - columns: List of column details + + Example: + >>> status = dj.migrate.check_migration_status(schema) + >>> print(f"Migration progress: {status['migrated']}/{status['total_blob_columns']}") + """ + columns = analyze_blob_columns(schema) + + return { + "total_blob_columns": len(columns), + "migrated": sum(1 for c in columns if not c["needs_migration"]), + "pending": sum(1 for c in columns if c["needs_migration"]), + "columns": columns, + } diff --git a/src/datajoint/table.py b/src/datajoint/table.py index 20f579225..89050bce1 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -742,8 +742,11 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False): raise DataJointError("badly formed UUID value {v} for attribute `{n}`".format(v=value, n=name)) value = value.bytes elif attr.is_blob: - value = blob.pack(value) - value = self.external[attr.store].put(value).bytes if attr.is_external else value + # Skip blob.pack if adapter already handles serialization + if not (attr.adapter and getattr(attr.adapter, "serializes", False)): + value = blob.pack(value) + if attr.is_external: + value = self.external[attr.store].put(value).bytes elif attr.is_attachment: attachment_path = Path(value) if attr.is_external: diff --git a/tests/test_attribute_type.py b/tests/test_attribute_type.py index 294b7eee8..9fc7cd86f 100644 --- a/tests/test_attribute_type.py +++ b/tests/test_attribute_type.py @@ -345,3 +345,71 @@ def test_attribute_adapter_deprecated(self): assert hasattr(dj, "AttributeAdapter") # AttributeAdapter should be a subclass of AttributeType assert issubclass(dj.AttributeAdapter, dj.AttributeType) + + +class TestDJBlobType: + """Tests for the built-in DJBlobType.""" + + def test_djblob_is_registered(self): + """Test that djblob is automatically registered.""" + assert is_type_registered("djblob") + + def test_djblob_properties(self): + """Test DJBlobType properties.""" + blob_type = get_type("djblob") + assert blob_type.type_name == "djblob" + assert blob_type.dtype == "longblob" + assert blob_type.serializes is True + + def test_djblob_encode_decode_roundtrip(self): + """Test that encode/decode is a proper roundtrip.""" + import numpy as np + + blob_type = get_type("djblob") + + # Test with various data types + test_data = [ + {"key": "value", "number": 42}, + [1, 2, 3, 4, 5], + np.array([1.0, 2.0, 3.0]), + "simple string", + (1, 2, 3), + None, + ] + + for original in test_data: + encoded = blob_type.encode(original) + assert isinstance(encoded, bytes) + decoded = blob_type.decode(encoded) + if isinstance(original, np.ndarray): + np.testing.assert_array_equal(decoded, original) + else: + assert decoded == original + + def test_djblob_encode_produces_valid_blob_format(self): + """Test that encoded data has valid blob protocol header.""" + blob_type = get_type("djblob") + encoded = blob_type.encode({"test": "data"}) + + # Should start with compression prefix or protocol header + valid_prefixes = (b"ZL123\0", b"mYm\0", b"dj0\0") + assert any(encoded.startswith(p) for p in valid_prefixes) + + def test_djblob_in_list_types(self): + """Test that djblob appears in list_types.""" + types = list_types() + assert "djblob" in types + + def test_serializes_flag_prevents_double_pack(self): + """Test that serializes=True prevents blob.pack being called twice. 
+ + This is a unit test for the flag itself. Integration test with tables + is in test_blob.py or test_adapted_attributes.py. + """ + blob_type = get_type("djblob") + assert blob_type.serializes is True + + # Legacy adapters should not have serializes=True + # (they rely on blob.pack being called after encode) + # AttributeType base class defaults to False + assert AttributeType.serializes is False From c8d8a22d8251bc4730f48baa5036c16363201a3e Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 02:57:52 +0000 Subject: [PATCH 05/41] Clarify migration handles all blob type variants --- docs/src/design/tables/customtype.md | 6 ++++-- src/datajoint/migrate.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/src/design/tables/customtype.md b/docs/src/design/tables/customtype.md index 4299df24d..4a8a9ae06 100644 --- a/docs/src/design/tables/customtype.md +++ b/docs/src/design/tables/customtype.md @@ -556,13 +556,15 @@ print(f"Migrated {result['migrated']} columns") The migration updates MySQL column comments to include the type declaration. This is a **metadata-only** change - the actual blob data format is unchanged. +All blob type variants are handled: `tinyblob`, `blob`, `mediumblob`, `longblob`. + Before migration: -- Column: `longblob` +- Column: `longblob` (or `blob`, `mediumblob`, etc.) - Comment: `user comment` - Behavior: Auto-serialization (implicit) After migration: -- Column: `longblob` +- Column: `longblob` (unchanged) - Comment: `::user comment` - Behavior: Explicit serialization via `` diff --git a/src/datajoint/migrate.py b/src/datajoint/migrate.py index e463da93a..b7c707d3e 100644 --- a/src/datajoint/migrate.py +++ b/src/datajoint/migrate.py @@ -32,6 +32,8 @@ def analyze_blob_columns(schema: Schema) -> list[dict]: 1. Have a MySQL blob type (tinyblob, blob, mediumblob, longblob) 2. Do NOT already have an adapter/type specified in their comment + All blob size variants are included in the analysis. + Args: schema: The DataJoint schema to analyze. @@ -39,7 +41,7 @@ def analyze_blob_columns(schema: Schema) -> list[dict]: List of dicts with keys: - table_name: Full table name (database.table) - column_name: Name of the blob column - - column_type: MySQL column type + - column_type: MySQL column type (tinyblob, blob, mediumblob, longblob) - current_comment: Current column comment - needs_migration: True if column should be migrated @@ -49,7 +51,7 @@ def analyze_blob_columns(schema: Schema) -> list[dict]: >>> columns = dj.migrate.analyze_blob_columns(schema) >>> for col in columns: ... if col['needs_migration']: - ... print(f"{col['table_name']}.{col['column_name']}") + ... 
print(f"{col['table_name']}.{col['column_name']} ({col['column_type']})") """ results = [] From 61db015f5065862ea420b09b4c51518d86defa0c Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 03:03:17 +0000 Subject: [PATCH 06/41] Fix ruff linter errors: add migrate to __all__, remove unused import --- src/datajoint/__init__.py | 1 + src/datajoint/migrate.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py index 0a8492cf1..ef9e59cb0 100644 --- a/src/datajoint/__init__.py +++ b/src/datajoint/__init__.py @@ -50,6 +50,7 @@ "list_types", "AttributeAdapter", # Deprecated, use AttributeType "errors", + "migrate", "DataJointError", "key", "key_hash", diff --git a/src/datajoint/migrate.py b/src/datajoint/migrate.py index b7c707d3e..696ca380e 100644 --- a/src/datajoint/migrate.py +++ b/src/datajoint/migrate.py @@ -15,7 +15,6 @@ from .errors import DataJointError if TYPE_CHECKING: - from .connection import Connection from .schemas import Schema logger = logging.getLogger(__name__.split(".")[0]) From 78e0d1dc94fb0ba7ca70c9897e64a45158ce8030 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 03:22:20 +0000 Subject: [PATCH 07/41] Remove serializes flag; longblob is now raw bytes Simplified design: - Plain longblob columns store/return raw bytes (no serialization) - type handles serialization via encode/decode - Legacy AttributeAdapter handles blob pack/unpack internally for backward compatibility This eliminates the need for the serializes flag by making blob serialization the responsibility of the adapter/type, not the framework. Migration to is now required for existing schemas that rely on implicit serialization. --- docs/src/design/tables/customtype.md | 38 +++++++++++++++++++++------- src/datajoint/attribute_adapter.py | 34 ++++++++++++++++++++++--- src/datajoint/attribute_type.py | 15 ++++------- src/datajoint/fetch.py | 7 ++--- src/datajoint/table.py | 5 ++-- tests/test_attribute_type.py | 24 ++++++++++-------- 6 files changed, 85 insertions(+), 38 deletions(-) diff --git a/docs/src/design/tables/customtype.md b/docs/src/design/tables/customtype.md index 4a8a9ae06..7504d5d23 100644 --- a/docs/src/design/tables/customtype.md +++ b/docs/src/design/tables/customtype.md @@ -498,22 +498,42 @@ class ProcessedData(dj.Manual): definition = """ data_id : int --- - results : # Explicit serialization - raw_bytes : longblob # Backward-compatible (auto-serialized) + results : # Serialized Python objects + raw_bytes : longblob # Raw bytes (no serialization) """ ``` #### When to Use `` -- **New tables**: Prefer `` for clarity and future-proofing -- **Custom types**: Use `` when your type chains to blob storage -- **Migration**: Existing `longblob` columns can be migrated to `` +- **Serialized data**: When storing Python objects (dicts, arrays, etc.) +- **New tables**: Prefer `` for automatic serialization +- **Migration**: Existing schemas with implicit serialization must migrate -#### Backward Compatibility +#### Raw Blob Behavior -For backward compatibility, `longblob` columns without an explicit type -still receive automatic serialization. The behavior is identical to ``, -but using `` makes the serialization explicit in your code. 
+Plain `longblob` (and other blob variants) columns now store and return +**raw bytes** without automatic serialization: + +```python +@schema +class RawData(dj.Manual): + definition = """ + id : int + --- + raw_bytes : longblob # Stores/returns raw bytes + serialized : # Stores Python objects with serialization + """ + +# Raw bytes - no serialization +RawData.insert1({"id": 1, "raw_bytes": b"raw binary data", "serialized": {"key": "value"}}) + +row = (RawData & "id=1").fetch1() +row["raw_bytes"] # Returns: b"raw binary data" +row["serialized"] # Returns: {"key": "value"} +``` + +**Important**: Existing schemas that relied on implicit blob serialization +must be migrated to `` to preserve their behavior. ## Schema Migration diff --git a/src/datajoint/attribute_adapter.py b/src/datajoint/attribute_adapter.py index 7e49abb5c..7df566a58 100644 --- a/src/datajoint/attribute_adapter.py +++ b/src/datajoint/attribute_adapter.py @@ -15,6 +15,9 @@ from .attribute_type import AttributeType, get_type, is_type_registered from .errors import DataJointError +# Pattern to detect blob types for internal pack/unpack +_BLOB_PATTERN = re.compile(r"^(tiny|small|medium|long|)blob", re.I) + class AttributeAdapter(AttributeType): """ @@ -87,12 +90,37 @@ def dtype(self) -> str: ) return attr_type + def _is_blob_dtype(self) -> bool: + """Check if dtype is a blob type requiring pack/unpack.""" + return bool(_BLOB_PATTERN.match(self.dtype)) + def encode(self, value: Any, *, key: dict | None = None) -> Any: - """Delegate to legacy put() method.""" - return self.put(value) + """ + Delegate to legacy put() method, with blob packing if needed. + + Legacy adapters expect blob.pack to be called after put() when + the dtype is a blob type. This wrapper handles that automatically. + """ + result = self.put(value) + # Legacy adapters expect blob.pack after put() for blob dtypes + if self._is_blob_dtype(): + from . import blob + + result = blob.pack(result) + return result def decode(self, stored: Any, *, key: dict | None = None) -> Any: - """Delegate to legacy get() method.""" + """ + Delegate to legacy get() method, with blob unpacking if needed. + + Legacy adapters expect blob.unpack to be called before get() when + the dtype is a blob type. This wrapper handles that automatically. + """ + # Legacy adapters expect blob.unpack before get() for blob dtypes + if self._is_blob_dtype(): + from . import blob + + stored = blob.unpack(stored) return self.get(stored) def put(self, obj: Any) -> Any: diff --git a/src/datajoint/attribute_type.py b/src/datajoint/attribute_type.py index d9a890a83..9be2d2214 100644 --- a/src/datajoint/attribute_type.py +++ b/src/datajoint/attribute_type.py @@ -153,10 +153,6 @@ def decode(self, stored: Any, *, key: dict | None = None) -> Any: """ ... - # Class attribute: If True, encode() produces final binary data (no blob.pack needed) - # Override in subclasses that handle their own serialization - serializes: bool = False - def validate(self, value: Any) -> None: """ Validate a value before encoding. @@ -440,19 +436,19 @@ class ProcessedData(dj.Manual): definition = ''' data_id : int --- - results : # Explicit DataJoint serialization + results : # Serialized Python objects raw_bytes : longblob # Raw bytes (no serialization) ''' Note: - For backward compatibility, ``longblob`` columns without an explicit type - still use automatic serialization. Use ```` to be explicit about - serialization behavior. + Plain ``longblob`` columns store and return raw bytes without serialization. 
+ Use ```` when you need automatic serialization of Python objects. + Existing schemas using implicit blob serialization should migrate to ```` + using ``dj.migrate.migrate_blob_columns()``. """ type_name = "djblob" dtype = "longblob" - serializes = True # This type handles its own serialization def encode(self, value: Any, *, key: dict | None = None) -> bytes: """ @@ -508,7 +504,6 @@ class LargeData(dj.Manual): # It's used internally when blob@store syntax is detected type_name = "djblob_external" dtype = "blob@store" # Placeholder - actual store is determined at declaration time - serializes = True # This type handles its own serialization def encode(self, value: Any, *, key: dict | None = None) -> bytes: """Serialize a Python object to DataJoint's blob format.""" diff --git a/src/datajoint/fetch.py b/src/datajoint/fetch.py index 4dfe42c12..73057938d 100644 --- a/src/datajoint/fetch.py +++ b/src/datajoint/fetch.py @@ -92,10 +92,11 @@ def adapt(x): return adapt(uuid.UUID(bytes=data)) elif attr.is_blob: blob_data = extern.get(uuid.UUID(bytes=data)) if attr.is_external else data - # Skip unpack if adapter handles its own deserialization - if attr.adapter and getattr(attr.adapter, "serializes", False): + # Adapters (like ) handle deserialization in decode() + # Without adapter, blob columns return raw bytes (no deserialization) + if attr.adapter: return attr.adapter.decode(blob_data, key=None) - return adapt(blob.unpack(blob_data, squeeze=squeeze)) + return blob_data # raw bytes else: return adapt(data) diff --git a/src/datajoint/table.py b/src/datajoint/table.py index 89050bce1..52ad32e71 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -742,9 +742,8 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False): raise DataJointError("badly formed UUID value {v} for attribute `{n}`".format(v=value, n=name)) value = value.bytes elif attr.is_blob: - # Skip blob.pack if adapter already handles serialization - if not (attr.adapter and getattr(attr.adapter, "serializes", False)): - value = blob.pack(value) + # Adapters (like ) handle serialization in encode() + # Without adapter, blob columns store raw bytes (no serialization) if attr.is_external: value = self.external[attr.store].put(value).bytes elif attr.is_attachment: diff --git a/tests/test_attribute_type.py b/tests/test_attribute_type.py index 9fc7cd86f..f8f822a60 100644 --- a/tests/test_attribute_type.py +++ b/tests/test_attribute_type.py @@ -359,7 +359,6 @@ def test_djblob_properties(self): blob_type = get_type("djblob") assert blob_type.type_name == "djblob" assert blob_type.dtype == "longblob" - assert blob_type.serializes is True def test_djblob_encode_decode_roundtrip(self): """Test that encode/decode is a proper roundtrip.""" @@ -400,16 +399,21 @@ def test_djblob_in_list_types(self): types = list_types() assert "djblob" in types - def test_serializes_flag_prevents_double_pack(self): - """Test that serializes=True prevents blob.pack being called twice. + def test_djblob_handles_serialization(self): + """Test that DJBlobType handles serialization internally. - This is a unit test for the flag itself. Integration test with tables - is in test_blob.py or test_adapted_attributes.py. 
+ With the new design: + - Plain longblob columns store/return raw bytes (no serialization) + - handles pack/unpack in encode/decode + - Legacy AttributeAdapter handles pack/unpack internally for backward compat """ blob_type = get_type("djblob") - assert blob_type.serializes is True - # Legacy adapters should not have serializes=True - # (they rely on blob.pack being called after encode) - # AttributeType base class defaults to False - assert AttributeType.serializes is False + # DJBlobType.encode() should produce packed bytes + data = {"key": "value"} + encoded = blob_type.encode(data) + assert isinstance(encoded, bytes) + + # DJBlobType.decode() should unpack back to original + decoded = blob_type.decode(encoded) + assert decoded == data From c17335674e0041b1ff5b6397e0d87f7d11eef18d Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 03:25:36 +0000 Subject: [PATCH 08/41] Remove unused blob imports from fetch.py and table.py --- src/datajoint/fetch.py | 2 +- src/datajoint/table.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/datajoint/fetch.py b/src/datajoint/fetch.py index 73057938d..147e70b7b 100644 --- a/src/datajoint/fetch.py +++ b/src/datajoint/fetch.py @@ -10,7 +10,7 @@ from datajoint.condition import Top -from . import blob, hash +from . import hash from .errors import DataJointError from .settings import config from .utils import safe_write diff --git a/src/datajoint/table.py b/src/datajoint/table.py index 52ad32e71..f3722fdde 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -12,7 +12,6 @@ import numpy as np import pandas -from . import blob from .condition import make_condition from .declare import alter, declare from .errors import ( From 106f859280e956a41bdf5abc04beeb4298d2f979 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 03:29:42 +0000 Subject: [PATCH 09/41] Update docs: use for serialized data, longblob for raw bytes --- docs/src/compute/key-source.md | 2 +- docs/src/compute/make.md | 4 ++-- docs/src/compute/populate.md | 6 +++--- docs/src/design/integrity.md | 2 +- docs/src/design/tables/attributes.md | 9 +++++++-- docs/src/design/tables/customtype.md | 2 +- docs/src/design/tables/master-part.md | 6 +++--- 7 files changed, 18 insertions(+), 13 deletions(-) diff --git a/docs/src/compute/key-source.md b/docs/src/compute/key-source.md index 76796ec0c..c9b5d2ce7 100644 --- a/docs/src/compute/key-source.md +++ b/docs/src/compute/key-source.md @@ -45,7 +45,7 @@ definition = """ -> Recording --- sample_rate : float -eeg_data : longblob +eeg_data : """ key_source = Recording & 'recording_type = "EEG"' ``` diff --git a/docs/src/compute/make.md b/docs/src/compute/make.md index 1b5569b65..390be3b7b 100644 --- a/docs/src/compute/make.md +++ b/docs/src/compute/make.md @@ -152,7 +152,7 @@ class ImageAnalysis(dj.Computed): # Complex image analysis results -> Image --- - analysis_result : longblob + analysis_result : processing_time : float """ @@ -188,7 +188,7 @@ class ImageAnalysis(dj.Computed): # Complex image analysis results -> Image --- - analysis_result : longblob + analysis_result : processing_time : float """ diff --git a/docs/src/compute/populate.md b/docs/src/compute/populate.md index 45c863f17..91db7b176 100644 --- a/docs/src/compute/populate.md +++ b/docs/src/compute/populate.md @@ -40,7 +40,7 @@ class FilteredImage(dj.Computed): # Filtered image -> Image --- - filtered_image : longblob + filtered_image : """ def make(self, key): @@ -196,7 +196,7 @@ class ImageAnalysis(dj.Computed): # Complex image analysis 
results -> Image --- - analysis_result : longblob + analysis_result : processing_time : float """ @@ -230,7 +230,7 @@ class ImageAnalysis(dj.Computed): # Complex image analysis results -> Image --- - analysis_result : longblob + analysis_result : processing_time : float """ diff --git a/docs/src/design/integrity.md b/docs/src/design/integrity.md index cb7122755..393103522 100644 --- a/docs/src/design/integrity.md +++ b/docs/src/design/integrity.md @@ -142,7 +142,7 @@ definition = """ -> EEGRecording channel_idx : int --- -channel_data : longblob +channel_data : """ ``` ![doc_1-many](../images/doc_1-many.png){: style="align:center"} diff --git a/docs/src/design/tables/attributes.md b/docs/src/design/tables/attributes.md index 4f8a0644e..c849e85ba 100644 --- a/docs/src/design/tables/attributes.md +++ b/docs/src/design/tables/attributes.md @@ -48,9 +48,10 @@ fractional digits. Because of its well-defined precision, `decimal` values can be used in equality comparison and be included in primary keys. -- `longblob`: arbitrary numeric array (e.g. matrix, image, structure), up to 4 +- `longblob`: raw binary data, up to 4 [GiB](http://en.wikipedia.org/wiki/Gibibyte) in size. - Numeric arrays are compatible between MATLAB and Python (NumPy). + Stores and returns raw bytes without serialization. + For serialized Python objects (arrays, dicts, etc.), use `` instead. The `longblob` and other `blob` datatypes can be configured to store data [externally](../../sysadmin/external-store.md) by using the `blob@store` syntax. @@ -71,6 +72,10 @@ info). These types abstract certain kinds of non-database data to facilitate use together with DataJoint. +- ``: DataJoint's native serialization format for Python objects. Supports +NumPy arrays, dicts, lists, datetime objects, and nested structures. Compatible with +MATLAB. See [custom types](customtype.md) for details. + - `attach`: a [file attachment](attach.md) similar to email attachments facillitating sending/receiving an opaque data file to/from a DataJoint pipeline. diff --git a/docs/src/design/tables/customtype.md b/docs/src/design/tables/customtype.md index 7504d5d23..267e0420b 100644 --- a/docs/src/design/tables/customtype.md +++ b/docs/src/design/tables/customtype.md @@ -454,7 +454,7 @@ schema = dj.schema("mydb") 1. **Choose descriptive type names**: Use lowercase with underscores (e.g., `spike_train`, `graph_embedding`) -2. **Select appropriate storage types**: Use `longblob` for complex objects, `json` for simple structures, external storage for large data +2. **Select appropriate storage types**: Use `` for complex objects, `json` for simple structures, external storage for large data 3. 
**Add validation**: Use `validate()` to catch data errors early diff --git a/docs/src/design/tables/master-part.md b/docs/src/design/tables/master-part.md index 629bfb8ab..d0f575e4d 100644 --- a/docs/src/design/tables/master-part.md +++ b/docs/src/design/tables/master-part.md @@ -26,8 +26,8 @@ class Segmentation(dj.Computed): -> Segmentation roi : smallint # roi number --- - roi_pixels : longblob # indices of pixels - roi_weights : longblob # weights of pixels + roi_pixels : # indices of pixels + roi_weights : # weights of pixels """ def make(self, key): @@ -101,7 +101,7 @@ definition = """ -> ElectrodeResponse channel: int --- -response: longblob # response of a channel +response: # response of a channel """ ``` From cab10f69af8ed9df314ce7d2acdd4a3d2f59c59d Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 05:01:22 +0000 Subject: [PATCH 10/41] Add storage types redesign spec Design document for reimplementing blob, attach, filepath, and object types as a coherent AttributeType system. Separates storage location (@store) from encoding behavior. --- docs/src/design/tables/storage-types-spec.md | 363 +++++++++++++++++++ 1 file changed, 363 insertions(+) create mode 100644 docs/src/design/tables/storage-types-spec.md diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md new file mode 100644 index 000000000..2247164d2 --- /dev/null +++ b/docs/src/design/tables/storage-types-spec.md @@ -0,0 +1,363 @@ +# Storage Types Redesign Spec + +## Overview + +This document proposes a redesign of DataJoint's storage types (`blob`, `attach`, `filepath`, `object`) as a coherent system built on the `AttributeType` base class. + +## Current State Analysis + +### Existing Types + +| Type | DB Column | Storage | Semantics | +|------|-----------|---------|-----------| +| `longblob` | LONGBLOB | Internal | Raw bytes | +| `blob@store` | binary(16) | External | Raw bytes via UUID | +| `attach` | LONGBLOB | Internal | `filename\0contents` | +| `attach@store` | binary(16) | External | File via UUID | +| `filepath@store` | binary(16) | External | Path-addressed file reference | +| `object` | JSON | External | Managed file/folder with ObjectRef | + +### Problems with Current Design + +1. **Scattered implementation**: Logic split across `declare.py`, `table.py`, `fetch.py`, `external.py` +2. **Inconsistent patterns**: Some types use AttributeType, others are hardcoded +3. **Implicit behaviors**: `longblob` previously auto-serialized, now raw +4. **Overlapping semantics**: `blob@store` vs `attach@store` unclear +5. **No internal object type**: `object` always requires external store + +## Proposed Architecture + +### Core Concepts + +1. **Storage Location** (orthogonal to type): + - **Internal**: Data stored directly in database column + - **External**: Data stored in external storage, UUID reference in database + +2. **Content Model** (what the type represents): + - **Binary**: Raw bytes (no interpretation) + - **Serialized**: Python objects encoded via DJ blob format + - **File**: Single file with filename metadata + - **Folder**: Directory structure + - **Reference**: Pointer to externally-managed file (path-addressed) + +3. 
**AttributeType** handles encoding/decoding between Python values and stored representation + +### Type Hierarchy + +``` + AttributeType (base) + │ + ┌─────────────────┼─────────────────┐ + │ │ │ + BinaryType SerializedType FileSystemType + (passthrough) (pack/unpack) │ + │ │ ┌──────┴──────┐ + │ │ │ │ + longblob + longblob@store filepath@store +``` + +### Proposed Types + +#### 1. Raw Binary (`longblob`, `blob`, etc.) + +**Not an AttributeType** - these are primitive MySQL types. + +- Store/return raw bytes without transformation +- `@store` variant stores externally with content-addressed UUID +- No encoding/decoding needed + +```python +# Table definition +class RawData(dj.Manual): + definition = """ + id : int + --- + data : longblob # raw bytes in DB + large_data : blob@store # raw bytes externally + """ + +# Usage +table.insert1({'id': 1, 'data': b'raw bytes', 'large_data': b'large raw bytes'}) +row = (table & 'id=1').fetch1() +assert row['data'] == b'raw bytes' # bytes returned +``` + +#### 2. Serialized Objects (``) + +**AttributeType** with DJ blob serialization. + +- Input: Any Python object (arrays, dicts, lists, etc.) +- Output: Same Python object reconstructed +- Storage: DJ blob format (mYm/dj0 protocol) + +```python +@dj.register_type +class DJBlobType(AttributeType): + type_name = "djblob" + dtype = "longblob" # or "longblob@store" for external + + def encode(self, value, *, key=None) -> bytes: + return blob.pack(value, compress=True) + + def decode(self, stored, *, key=None) -> Any: + return blob.unpack(stored) +``` + +```python +# Table definition +class ProcessedData(dj.Manual): + definition = """ + id : int + --- + result : # serialized in DB + large_result : # serialized externally + """ + +# Usage +table.insert1({'id': 1, 'result': {'array': np.array([1,2,3]), 'meta': 'info'}}) +row = (table & 'id=1').fetch1() +assert row['result']['meta'] == 'info' # Python dict returned +``` + +#### 3. File Attachments (``) + +**AttributeType** for file storage with filename preservation. + +- Input: File path (string or Path) +- Output: Local file path after download +- Storage: File contents with filename metadata + +```python +@dj.register_type +class AttachType(AttributeType): + type_name = "attach" + dtype = "longblob" # or "longblob@store" for external + + # For internal storage + def encode(self, filepath, *, key=None) -> bytes: + path = Path(filepath) + return path.name.encode() + b"\0" + path.read_bytes() + + def decode(self, stored, *, key=None) -> str: + filename, contents = stored.split(b"\0", 1) + # Download to configured path, return local filepath + ... +``` + +**Key difference from blob**: Preserves original filename, returns file path not bytes. + +```python +# Table definition +class Attachments(dj.Manual): + definition = """ + id : int + --- + config_file : # small file in DB + data_file : # large file externally + """ + +# Usage +table.insert1({'id': 1, 'config_file': '/path/to/config.yaml'}) +row = (table & 'id=1').fetch1() +# row['config_file'] == '/downloads/config.yaml' # local path +``` + +#### 4. Filepath References (``) + +**AttributeType** for tracking externally-managed files. 
+ +- Input: File path in staging area +- Output: Local file path after sync +- Storage: Path-addressed (UUID = hash of relative path, not contents) +- Tracks `contents_hash` separately for verification + +```python +@dj.register_type +class FilepathType(AttributeType): + type_name = "filepath" + dtype = "binary(16)" # Always external (UUID reference) + requires_store = True # Must specify @store + + def encode(self, filepath, *, key=None) -> bytes: + # Compute UUID from relative path + # Track contents_hash separately + ... + + def decode(self, uuid_bytes, *, key=None) -> str: + # Sync file from remote to local stage + # Verify contents_hash + # Return local path + ... +``` + +**Key difference from attach**: +- Path-addressed (same path = same UUID, even if contents change) +- Designed for managed file workflows where files may be updated +- Always external (no internal variant) + +```python +# Table definition +class ManagedFiles(dj.Manual): + definition = """ + id : int + --- + data_path : + """ + +# Usage - file must be in configured stage directory +table.insert1({'id': 1, 'data_path': '/stage/experiment_001/data.h5'}) +row = (table & 'id=1').fetch1() +# row['data_path'] == '/local_stage/experiment_001/data.h5' +``` + +#### 5. Managed Objects (``) + +**AttributeType** for managed file/folder storage with lazy access. + +- Input: File path, folder path, or ObjectRef +- Output: ObjectRef handle (lazy - no automatic download) +- Storage: JSON metadata column +- Supports direct writes (Zarr, HDF5) via fsspec + +```python +@dj.register_type +class ObjectType(AttributeType): + type_name = "object" + dtype = "json" + requires_store = True # Must specify @store + + def encode(self, value, *, key=None) -> str: + # Upload file/folder to object storage + # Return JSON metadata + ... + + def decode(self, json_str, *, key=None) -> ObjectRef: + # Return ObjectRef handle (no download) + ... +``` + +```python +# Table definition +class LargeData(dj.Manual): + definition = """ + id : int + --- + zarr_data : + """ + +# Usage +table.insert1({'id': 1, 'zarr_data': '/path/to/data.zarr'}) +row = (table & 'id=1').fetch1() +ref = row['zarr_data'] # ObjectRef handle +ref.download('/local/path') # Explicit download +# Or direct access via fsspec +``` + +### Storage Location Modifier (`@store`) + +The `@store` suffix is orthogonal to the type and specifies external storage: + +| Type | Without @store | With @store | +|------|---------------|-------------| +| `longblob` | Raw bytes in DB | Raw bytes in external store | +| `` | Serialized in DB | Serialized in external store | +| `` | File in DB | File in external store | +| `` | N/A (error) | Path reference in external store | +| `` | N/A (error) | Object in external store | + +Implementation: +- `@store` changes the underlying `dtype` to `binary(16)` (UUID) +- Creates FK relationship to `~external_{store}` tracking table +- AttributeType's `encode()`/`decode()` work with the external table transparently + +### Extended AttributeType Interface + +For types that interact with the filesystem, we extend the base interface: + +```python +class FileSystemType(AttributeType): + """Base for types that work with file paths.""" + + # Standard interface + def encode(self, value, *, key=None) -> bytes | str: + """Convert input (path or value) to stored representation.""" + ... + + def decode(self, stored, *, key=None) -> str: + """Convert stored representation to local file path.""" + ... 
+ + # Extended interface for external storage + def upload(self, filepath: Path, external: ExternalTable) -> uuid.UUID: + """Upload file to external storage, return UUID.""" + ... + + def download(self, uuid: uuid.UUID, external: ExternalTable, + download_path: Path) -> Path: + """Download from external storage to local path.""" + ... +``` + +### Configuration + +```python +# datajoint config +dj.config['stores'] = { + 'main': { + 'protocol': 's3', + 'endpoint': 's3.amazonaws.com', + 'bucket': 'my-bucket', + 'location': 'datajoint/', + }, + 'archive': { + 'protocol': 'file', + 'location': '/mnt/archive/', + } +} + +dj.config['download_path'] = '/tmp/dj_downloads' # For attach +dj.config['stage'] = '/data/stage' # For filepath +``` + +## Migration Path + +### Phase 1: Current State (Done) +- `` AttributeType implemented +- `longblob` returns raw bytes +- Legacy `AttributeAdapter` wrapped for backward compat + +### Phase 2: Attach as AttributeType +- Implement `` and `` as AttributeType +- Deprecate bare `attach` type (still works, emits warning) +- Move logic from table.py/fetch.py to AttachType class + +### Phase 3: Filepath as AttributeType +- Implement `` as AttributeType +- Deprecate `filepath@store` syntax (redirect to ``) + +### Phase 4: Object Type Refinement +- Already implemented as separate system +- Ensure consistent with AttributeType patterns +- Consider `` syntax + +### Phase 5: Cleanup +- Remove scattered type handling from table.py, fetch.py +- Consolidate external storage logic +- Update documentation + +## Summary + +| Type | Input | Output | Internal | External | Use Case | +|------|-------|--------|----------|----------|----------| +| `longblob` | bytes | bytes | ✓ | ✓ | Raw binary data | +| `` | any | any | ✓ | ✓ | Python objects, arrays | +| `` | path | path | ✓ | ✓ | Files with filename | +| `` | path | path | ✗ | ✓ | Managed file workflows | +| `` | path/ref | ObjectRef | ✗ | ✓ | Large files, Zarr, HDF5 | + +This design: +1. Makes all custom types consistent AttributeTypes +2. Separates storage location (`@store`) from encoding behavior +3. Provides clear semantics for each type +4. Enables gradual migration from current implementation From ecac82de457af638d648465b0c6b4948e6fd9e9f Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 05:20:48 +0000 Subject: [PATCH 11/41] Update storage types spec with OAS integration approach - Clarify OAS (object type) as distinct system - Propose storing blob@store/attach@store in OAS _external/ folder - Content-addressed deduplication via hash stored in varchar(64) - Propose to replace filepath@store - Add open questions and implementation phases Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 495 +++++++++---------- 1 file changed, 235 insertions(+), 260 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 2247164d2..79627a990 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -2,100 +2,56 @@ ## Overview -This document proposes a redesign of DataJoint's storage types (`blob`, `attach`, `filepath`, `object`) as a coherent system built on the `AttributeType` base class. +This document proposes a redesign of DataJoint's storage types as AttributeTypes, with clear separation between: -## Current State Analysis +1. **Object-Augmented Schemas (OAS)** - New paradigm with managed stores, integrity constraints, and prescribed organization +2. 
**Legacy External Storage** - Content-addressed blob/attach storage with deduplication +3. **Internal Blob Types** - AttributeTypes that serialize into database blob columns -### Existing Types +## Type Categories -| Type | DB Column | Storage | Semantics | -|------|-----------|---------|-----------| -| `longblob` | LONGBLOB | Internal | Raw bytes | -| `blob@store` | binary(16) | External | Raw bytes via UUID | -| `attach` | LONGBLOB | Internal | `filename\0contents` | -| `attach@store` | binary(16) | External | File via UUID | -| `filepath@store` | binary(16) | External | Path-addressed file reference | -| `object` | JSON | External | Managed file/folder with ObjectRef | +### 1. Object-Augmented Schemas (`object`, `object@store`) -### Problems with Current Design +**Already implemented.** A distinct system where stores are treated as part of the database: -1. **Scattered implementation**: Logic split across `declare.py`, `table.py`, `fetch.py`, `external.py` -2. **Inconsistent patterns**: Some types use AttributeType, others are hardcoded -3. **Implicit behaviors**: `longblob` previously auto-serialized, now raw -4. **Overlapping semantics**: `blob@store` vs `attach@store` unclear -5. **No internal object type**: `object` always requires external store - -## Proposed Architecture - -### Core Concepts - -1. **Storage Location** (orthogonal to type): - - **Internal**: Data stored directly in database column - - **External**: Data stored in external storage, UUID reference in database - -2. **Content Model** (what the type represents): - - **Binary**: Raw bytes (no interpretation) - - **Serialized**: Python objects encoded via DJ blob format - - **File**: Single file with filename metadata - - **Folder**: Directory structure - - **Reference**: Pointer to externally-managed file (path-addressed) - -3. **AttributeType** handles encoding/decoding between Python values and stored representation - -### Type Hierarchy - -``` - AttributeType (base) - │ - ┌─────────────────┼─────────────────┐ - │ │ │ - BinaryType SerializedType FileSystemType - (passthrough) (pack/unpack) │ - │ │ ┌──────┴──────┐ - │ │ │ │ - longblob - longblob@store filepath@store -``` - -### Proposed Types - -#### 1. Raw Binary (`longblob`, `blob`, etc.) - -**Not an AttributeType** - these are primitive MySQL types. - -- Store/return raw bytes without transformation -- `@store` variant stores externally with content-addressed UUID -- No encoding/decoding needed +- Robust integrity constraints +- Prescribed path organization (derived from primary key) +- Multiple store support via config +- Returns `ObjectRef` for lazy access +- Supports direct writes (Zarr, HDF5) via fsspec ```python # Table definition -class RawData(dj.Manual): +class Analysis(dj.Computed): definition = """ - id : int + -> Recording --- - data : longblob # raw bytes in DB - large_data : blob@store # raw bytes externally + results : object@main # stored in 'main' OAS store """ # Usage -table.insert1({'id': 1, 'data': b'raw bytes', 'large_data': b'large raw bytes'}) -row = (table & 'id=1').fetch1() -assert row['data'] == b'raw bytes' # bytes returned +row = (Analysis & key).fetch1() +ref = row['results'] # ObjectRef handle (lazy) +ref.download('/local/path') # explicit download +data = ref.open() # fsspec access ``` -#### 2. Serialized Objects (``) +**This type is NOT part of the AttributeType redesign** - it has its own implementation path. + +--- -**AttributeType** with DJ blob serialization. +### 2. 
Serialized Blobs (``) + +**Already implemented.** AttributeType for Python object serialization. - Input: Any Python object (arrays, dicts, lists, etc.) - Output: Same Python object reconstructed -- Storage: DJ blob format (mYm/dj0 protocol) +- Storage: DJ blob format (mYm/dj0 protocol) in LONGBLOB column ```python -@dj.register_type class DJBlobType(AttributeType): type_name = "djblob" - dtype = "longblob" # or "longblob@store" for external + dtype = "longblob" def encode(self, value, *, key=None) -> bytes: return blob.pack(value, compress=True) @@ -104,260 +60,279 @@ class DJBlobType(AttributeType): return blob.unpack(stored) ``` -```python -# Table definition -class ProcessedData(dj.Manual): - definition = """ - id : int - --- - result : # serialized in DB - large_result : # serialized externally - """ - -# Usage -table.insert1({'id': 1, 'result': {'array': np.array([1,2,3]), 'meta': 'info'}}) -row = (table & 'id=1').fetch1() -assert row['result']['meta'] == 'info' # Python dict returned -``` +--- -#### 3. File Attachments (``) +### 3. File Attachments (``) - TO IMPLEMENT -**AttributeType** for file storage with filename preservation. +AttributeType for serializing files into internal blob columns. - Input: File path (string or Path) -- Output: Local file path after download -- Storage: File contents with filename metadata +- Output: Local file path after extraction +- Storage: `filename\0contents` in LONGBLOB column ```python @dj.register_type class AttachType(AttributeType): type_name = "attach" - dtype = "longblob" # or "longblob@store" for external + dtype = "longblob" - # For internal storage def encode(self, filepath, *, key=None) -> bytes: path = Path(filepath) return path.name.encode() + b"\0" + path.read_bytes() def decode(self, stored, *, key=None) -> str: filename, contents = stored.split(b"\0", 1) - # Download to configured path, return local filepath - ... + download_path = Path(dj.config['download_path']) / filename + download_path.parent.mkdir(parents=True, exist_ok=True) + download_path.write_bytes(contents) + return str(download_path) ``` -**Key difference from blob**: Preserves original filename, returns file path not bytes. - +**Usage:** ```python -# Table definition -class Attachments(dj.Manual): +class Configs(dj.Manual): definition = """ - id : int + config_id : int --- - config_file : # small file in DB - data_file : # large file externally + config_file : # file serialized into DB """ -# Usage -table.insert1({'id': 1, 'config_file': '/path/to/config.yaml'}) -row = (table & 'id=1').fetch1() -# row['config_file'] == '/downloads/config.yaml' # local path +# Insert +table.insert1({'config_id': 1, 'config_file': '/path/to/config.yaml'}) + +# Fetch - file extracted to download_path +row = (table & 'config_id=1').fetch1() +local_path = row['config_file'] # '/downloads/config.yaml' ``` -#### 4. Filepath References (``) +--- + +### 4. External Content-Addressed Storage (``, ``) - TO DESIGN + +These types store content externally with deduplication via content hashing. -**AttributeType** for tracking externally-managed files. +#### Design Option A: Leverage OAS Stores -- Input: File path in staging area -- Output: Local file path after sync -- Storage: Path-addressed (UUID = hash of relative path, not contents) -- Tracks `contents_hash` separately for verification +Store content-addressed blobs within OAS stores under a reserved folder: + +``` +store_root/ +├── _external/ # Reserved for content-addressed storage +│ ├── blobs/ # For +│ │ └── ab/cd/abcd1234... 
# Path derived from content hash +│ └── attach/ # For +│ └── ef/gh/efgh5678.../filename.ext +└── schema_name/ # Normal OAS paths + └── table_name/ + └── pk_value/ +``` + +**Advantages:** +- Reuses OAS infrastructure (fsspec, store config) +- DataJoint fully controls paths +- Deduplication via content hash +- No separate `~external_*` tracking tables needed + +**Implementation:** ```python +class ContentAddressedType(AttributeType): + """Base class for content-addressed external storage.""" + + subfolder: str # 'blobs' or 'attach' + + def _content_hash(self, data: bytes) -> str: + """Compute content hash for deduplication.""" + return hashlib.sha256(data).hexdigest() + + def _store_path(self, content_hash: str) -> str: + """Generate path within _external folder.""" + return f"_external/{self.subfolder}/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + + @dj.register_type -class FilepathType(AttributeType): - type_name = "filepath" - dtype = "binary(16)" # Always external (UUID reference) - requires_store = True # Must specify @store +class DJBlobExternalType(ContentAddressedType): + type_name = "djblob" # Same name, different dtype triggers external + dtype = "varchar(64)" # Store content hash as string + subfolder = "blobs" + + def encode(self, value, *, key=None, store=None) -> str: + data = blob.pack(value, compress=True) + content_hash = self._content_hash(data) + path = self._store_path(content_hash) + # Upload to store if not exists (deduplication) + store.put_if_absent(path, data) + return content_hash + + def decode(self, content_hash, *, key=None, store=None) -> Any: + path = self._store_path(content_hash) + data = store.get(path) + return blob.unpack(data) - def encode(self, filepath, *, key=None) -> bytes: - # Compute UUID from relative path - # Track contents_hash separately - ... - def decode(self, uuid_bytes, *, key=None) -> str: - # Sync file from remote to local stage - # Verify contents_hash - # Return local path +@dj.register_type +class AttachExternalType(ContentAddressedType): + type_name = "attach" + dtype = "varchar(64)" + subfolder = "attach" + + def encode(self, filepath, *, key=None, store=None) -> str: + path = Path(filepath) + # Hash includes filename for uniqueness + data = path.name.encode() + b"\0" + path.read_bytes() + content_hash = self._content_hash(data) + store_path = self._store_path(content_hash) + "/" + path.name + store.put_if_absent(store_path, path.read_bytes()) + return content_hash + + def decode(self, content_hash, *, key=None, store=None) -> str: + # List files in hash folder to get filename ... ``` -**Key difference from attach**: -- Path-addressed (same path = same UUID, even if contents change) -- Designed for managed file workflows where files may be updated -- Always external (no internal variant) +#### Design Option B: Separate Tracking Tables (Current Approach) -```python -# Table definition -class ManagedFiles(dj.Manual): - definition = """ - id : int - --- - data_path : - """ +Keep `~external_{store}` tables for tracking: -# Usage - file must be in configured stage directory -table.insert1({'id': 1, 'data_path': '/stage/experiment_001/data.h5'}) -row = (table & 'id=1').fetch1() -# row['data_path'] == '/local_stage/experiment_001/data.h5' +```sql +-- ~external_main +hash : binary(16) # UUID from content hash +--- +size : bigint +attachment_name: varchar(255) # for attach only +timestamp : timestamp ``` -#### 5. 
Managed Objects (``) +**Disadvantages:** +- Separate infrastructure from OAS +- Additional table maintenance +- More complex cleanup/garbage collection -**AttributeType** for managed file/folder storage with lazy access. +#### Recommendation -- Input: File path, folder path, or ObjectRef -- Output: ObjectRef handle (lazy - no automatic download) -- Storage: JSON metadata column -- Supports direct writes (Zarr, HDF5) via fsspec +**Option A (OAS integration)** is cleaner: +- Single storage paradigm +- Simpler mental model +- Content hash stored directly in column (no UUID indirection) +- Deduplication at storage level + +--- + +### 5. Reference Tracking (``) - TO DESIGN + +Repurpose `filepath@store` as a general reference type, borrowing from ObjRef: + +**Current `filepath@store` limitations:** +- Path-addressed (hash of path, not contents) +- Requires staging area +- Archaic copy-to/copy-from model + +**Proposed ``:** +- Track references to external resources +- Support multiple reference types (file path, URL, object key) +- Borrow lazy access patterns from ObjRef +- Optional content verification ```python @dj.register_type -class ObjectType(AttributeType): - type_name = "object" +class RefType(AttributeType): + type_name = "ref" dtype = "json" - requires_store = True # Must specify @store - - def encode(self, value, *, key=None) -> str: - # Upload file/folder to object storage - # Return JSON metadata - ... - def decode(self, json_str, *, key=None) -> ObjectRef: - # Return ObjectRef handle (no download) - ... + def encode(self, value, *, key=None, store=None) -> str: + if isinstance(value, str): + # Treat as path/URL + return json.dumps({ + 'type': 'path', + 'path': value, + 'store': store.name, + 'content_hash': self._compute_hash(value) if verify else None + }) + elif isinstance(value, RefSpec): + return json.dumps(value.to_dict()) + + def decode(self, json_str, *, key=None, store=None) -> Ref: + data = json.loads(json_str) + return Ref(data, store=store) + + +class Ref: + """Reference handle (similar to ObjectRef).""" + + def __init__(self, data, store): + self.path = data['path'] + self.store = store + self._content_hash = data.get('content_hash') + + def download(self, local_path): + """Download referenced file.""" + self.store.download(self.path, local_path) + if self._content_hash: + self._verify(local_path) + + def open(self, mode='rb'): + """Open via fsspec (lazy).""" + return self.store.open(self.path, mode) ``` +**Usage:** ```python -# Table definition -class LargeData(dj.Manual): +class ExternalData(dj.Manual): definition = """ - id : int + data_id : int --- - zarr_data : + source : # reference to external file """ -# Usage -table.insert1({'id': 1, 'zarr_data': '/path/to/data.zarr'}) -row = (table & 'id=1').fetch1() -ref = row['zarr_data'] # ObjectRef handle -ref.download('/local/path') # Explicit download -# Or direct access via fsspec +# Insert - just tracks the reference +table.insert1({'data_id': 1, 'source': '/archive/experiment_001/data.h5'}) + +# Fetch - returns Ref handle +row = (table & 'data_id=1').fetch1() +ref = row['source'] +ref.download('/local/data.h5') # explicit download ``` -### Storage Location Modifier (`@store`) +--- -The `@store` suffix is orthogonal to the type and specifies external storage: +## Summary of Types -| Type | Without @store | With @store | -|------|---------------|-------------| -| `longblob` | Raw bytes in DB | Raw bytes in external store | -| `` | Serialized in DB | Serialized in external store | -| `` | File in DB | File in external 
store | -| `` | N/A (error) | Path reference in external store | -| `` | N/A (error) | Object in external store | +| Type | Storage | Column | Input | Output | Dedup | +|------|---------|--------|-------|--------|-------| +| `object@store` | OAS store | JSON | path/ref | ObjectRef | By path | +| `` | Internal | LONGBLOB | any | any | No | +| `` | OAS `_external/` | varchar(64) | any | any | By content | +| `` | Internal | LONGBLOB | path | path | No | +| `` | OAS `_external/` | varchar(64) | path | path | By content | +| `` | OAS store | JSON | path/ref | Ref | No (tracks) | -Implementation: -- `@store` changes the underlying `dtype` to `binary(16)` (UUID) -- Creates FK relationship to `~external_{store}` tracking table -- AttributeType's `encode()`/`decode()` work with the external table transparently +## Open Questions -### Extended AttributeType Interface +1. **Store syntax**: Should external AttributeTypes use `` or detect externality from dtype? -For types that interact with the filesystem, we extend the base interface: +2. **Backward compatibility**: How to handle existing `blob@store` and `attach@store` columns with `~external_*` tables? -```python -class FileSystemType(AttributeType): - """Base for types that work with file paths.""" +3. **Deduplication scope**: Per-store or global across stores? - # Standard interface - def encode(self, value, *, key=None) -> bytes | str: - """Convert input (path or value) to stored representation.""" - ... +4. **Ref vs filepath**: Deprecate `filepath@store` entirely or keep as alias? - def decode(self, stored, *, key=None) -> str: - """Convert stored representation to local file path.""" - ... +5. **Content hash format**: SHA256 hex (64 chars) or shorter hash? - # Extended interface for external storage - def upload(self, filepath: Path, external: ExternalTable) -> uuid.UUID: - """Upload file to external storage, return UUID.""" - ... +## Implementation Phases - def download(self, uuid: uuid.UUID, external: ExternalTable, - download_path: Path) -> Path: - """Download from external storage to local path.""" - ... 
-``` +### Phase 1: `` Internal +- Implement AttachType for internal blob storage +- Deprecate bare `attach` keyword (still works, warns) -### Configuration +### Phase 2: Content-Addressed External +- Implement ContentAddressedType base +- Add `` and `` +- Store in OAS `_external/` folder -```python -# datajoint config -dj.config['stores'] = { - 'main': { - 'protocol': 's3', - 'endpoint': 's3.amazonaws.com', - 'bucket': 'my-bucket', - 'location': 'datajoint/', - }, - 'archive': { - 'protocol': 'file', - 'location': '/mnt/archive/', - } -} - -dj.config['download_path'] = '/tmp/dj_downloads' # For attach -dj.config['stage'] = '/data/stage' # For filepath -``` +### Phase 3: Reference Type +- Implement `` with Ref handle +- Deprecate `filepath@store` -## Migration Path - -### Phase 1: Current State (Done) -- `` AttributeType implemented -- `longblob` returns raw bytes -- Legacy `AttributeAdapter` wrapped for backward compat - -### Phase 2: Attach as AttributeType -- Implement `` and `` as AttributeType -- Deprecate bare `attach` type (still works, emits warning) -- Move logic from table.py/fetch.py to AttachType class - -### Phase 3: Filepath as AttributeType -- Implement `` as AttributeType -- Deprecate `filepath@store` syntax (redirect to ``) - -### Phase 4: Object Type Refinement -- Already implemented as separate system -- Ensure consistent with AttributeType patterns -- Consider `` syntax - -### Phase 5: Cleanup -- Remove scattered type handling from table.py, fetch.py -- Consolidate external storage logic -- Update documentation - -## Summary - -| Type | Input | Output | Internal | External | Use Case | -|------|-------|--------|----------|----------|----------| -| `longblob` | bytes | bytes | ✓ | ✓ | Raw binary data | -| `` | any | any | ✓ | ✓ | Python objects, arrays | -| `` | path | path | ✓ | ✓ | Files with filename | -| `` | path | path | ✗ | ✓ | Managed file workflows | -| `` | path/ref | ObjectRef | ✗ | ✓ | Large files, Zarr, HDF5 | - -This design: -1. Makes all custom types consistent AttributeTypes -2. Separates storage location (`@store`) from encoding behavior -3. Provides clear semantics for each type -4. 
Enables gradual migration from current implementation +### Phase 4: Migration Tools +- Tools to migrate `~external_*` data to new format +- Backward compat layer for reading old format From 7e7f9682d9336e80f833c4e3f11496609a4482d6 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 05:31:52 +0000 Subject: [PATCH 12/41] Unify external storage under OAS with content-addressed region - All external storage uses OAS infrastructure - Path-addressed: regular object@store (existing) - Content-addressed: _content/ folder for , - ContentRegistry table for reference counting and GC - ObjectRef returned for all external types (lazy access) - Deduplication via SHA256 content hash Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 469 +++++++++---------- 1 file changed, 223 insertions(+), 246 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 79627a990..844564755 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -2,337 +2,314 @@ ## Overview -This document proposes a redesign of DataJoint's storage types as AttributeTypes, with clear separation between: +This document proposes a unified storage architecture where all external storage uses the Object-Augmented Schema (OAS) paradigm, with a special content-addressable region for deduplicated objects. -1. **Object-Augmented Schemas (OAS)** - New paradigm with managed stores, integrity constraints, and prescribed organization -2. **Legacy External Storage** - Content-addressed blob/attach storage with deduplication -3. **Internal Blob Types** - AttributeTypes that serialize into database blob columns +## Architecture -## Type Categories +### Two Storage Modes within OAS -### 1. Object-Augmented Schemas (`object`, `object@store`) +``` +store_root/ +├── {schema}/{table}/{pk}/ # Path-addressed (regular OAS) +│ └── {attribute}/ # Derived from primary key +│ └── ... # Files, folders, Zarr, etc. +│ +└── _content/ # Content-addressed (deduplicated) + └── {hash[:2]}/{hash[2:4]}/ + └── {hash}/ # Full SHA256 hash + └── ... # Object contents +``` -**Already implemented.** A distinct system where stores are treated as part of the database: +### 1. Path-Addressed Objects (`object@store`) -- Robust integrity constraints -- Prescribed path organization (derived from primary key) -- Multiple store support via config +**Already implemented.** Regular OAS behavior: +- Path derived from primary key +- One-to-one relationship with table row +- Deleted when row is deleted - Returns `ObjectRef` for lazy access -- Supports direct writes (Zarr, HDF5) via fsspec ```python -# Table definition class Analysis(dj.Computed): definition = """ -> Recording --- - results : object@main # stored in 'main' OAS store + results : object@main """ - -# Usage -row = (Analysis & key).fetch1() -ref = row['results'] # ObjectRef handle (lazy) -ref.download('/local/path') # explicit download -data = ref.open() # fsspec access ``` -**This type is NOT part of the AttributeType redesign** - it has its own implementation path. +### 2. Content-Addressed Objects (``, ``) ---- - -### 2. Serialized Blobs (``) - -**Already implemented.** AttributeType for Python object serialization. - -- Input: Any Python object (arrays, dicts, lists, etc.) 
-- Output: Same Python object reconstructed -- Storage: DJ blob format (mYm/dj0 protocol) in LONGBLOB column +**New.** Stored in `_content/` region with deduplication: +- Path derived from content hash (SHA256) +- Many-to-one: multiple rows can reference same object +- Reference counted for garbage collection +- Returns `ObjectRef` for lazy access (same as regular OAS) ```python -class DJBlobType(AttributeType): - type_name = "djblob" - dtype = "longblob" - - def encode(self, value, *, key=None) -> bytes: - return blob.pack(value, compress=True) - - def decode(self, stored, *, key=None) -> Any: - return blob.unpack(stored) +class ProcessedData(dj.Computed): + definition = """ + -> RawData + --- + features : # Serialized Python object, deduplicated + source_file : # File attachment, deduplicated + """ ``` ---- +## Content-Addressed Storage Design -### 3. File Attachments (``) - TO IMPLEMENT - -AttributeType for serializing files into internal blob columns. - -- Input: File path (string or Path) -- Output: Local file path after extraction -- Storage: `filename\0contents` in LONGBLOB column +### Storage Path ```python -@dj.register_type -class AttachType(AttributeType): - type_name = "attach" - dtype = "longblob" - - def encode(self, filepath, *, key=None) -> bytes: - path = Path(filepath) - return path.name.encode() + b"\0" + path.read_bytes() +def content_path(content_hash: str) -> str: + """Generate path for content-addressed object.""" + return f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" - def decode(self, stored, *, key=None) -> str: - filename, contents = stored.split(b"\0", 1) - download_path = Path(dj.config['download_path']) / filename - download_path.parent.mkdir(parents=True, exist_ok=True) - download_path.write_bytes(contents) - return str(download_path) +# Example: hash "a1b2c3d4..." -> "_content/a1/b2/a1b2c3d4..." ``` -**Usage:** +### Reference Registry + +A schema-level table tracks content-addressed objects for reference counting: + ```python -class Configs(dj.Manual): +class ContentRegistry: + """ + Tracks content-addressed objects for garbage collection. + One per schema, created automatically when content-addressed types are used. 
+ """ definition = """ - config_id : int + # Content-addressed object registry + content_hash : char(64) # SHA256 hex --- - config_file : # file serialized into DB + store : varchar(64) # Store name + size : bigint unsigned # Object size in bytes + created : timestamp DEFAULT CURRENT_TIMESTAMP """ +``` -# Insert -table.insert1({'config_id': 1, 'config_file': '/path/to/config.yaml'}) +### Reference Counting -# Fetch - file extracted to download_path -row = (table & 'config_id=1').fetch1() -local_path = row['config_file'] # '/downloads/config.yaml' -``` +Reference counting is implicit via database queries: ---- +```python +def find_orphans(schema) -> list[tuple[str, str]]: + """Find content hashes not referenced by any table.""" + + # Get all registered hashes + registered = set(ContentRegistry().fetch('content_hash', 'store')) + + # Get all referenced hashes from tables + referenced = set() + for table in schema.tables: + for attr in table.heading.attributes: + if attr.is_content_addressed: + hashes = table.fetch(attr.name) + referenced.update((h, attr.store) for h in hashes) + + return registered - referenced + +def garbage_collect(schema): + """Remove orphaned content-addressed objects.""" + for content_hash, store in find_orphans(schema): + # Delete from storage + store_backend = get_store(store) + store_backend.delete(content_path(content_hash)) + # Delete from registry + (ContentRegistry() & {'content_hash': content_hash}).delete() +``` -### 4. External Content-Addressed Storage (``, ``) - TO DESIGN +### ObjectRef for Content-Addressed Objects -These types store content externally with deduplication via content hashing. +Content-addressed objects return `ObjectRef` just like regular OAS objects: -#### Design Option A: Leverage OAS Stores +```python +row = (ProcessedData & key).fetch1() -Store content-addressed blobs within OAS stores under a reserved folder: +# Both return ObjectRef +results_ref = row['features'] # +file_ref = row['source_file'] # -``` -store_root/ -├── _external/ # Reserved for content-addressed storage -│ ├── blobs/ # For -│ │ └── ab/cd/abcd1234... # Path derived from content hash -│ └── attach/ # For -│ └── ef/gh/efgh5678.../filename.ext -└── schema_name/ # Normal OAS paths - └── table_name/ - └── pk_value/ +# Same interface as regular OAS +results_ref.download('/local/path') +data = results_ref.load() # For djblob: deserialize +local_path = file_ref.download() # For attach: download, return path ``` -**Advantages:** -- Reuses OAS infrastructure (fsspec, store config) -- DataJoint fully controls paths -- Deduplication via content hash -- No separate `~external_*` tracking tables needed +## AttributeType Implementations -**Implementation:** +### `` - Internal Serialized Blob ```python -class ContentAddressedType(AttributeType): - """Base class for content-addressed external storage.""" - - subfolder: str # 'blobs' or 'attach' +@dj.register_type +class DJBlobType(AttributeType): + type_name = "djblob" + dtype = "longblob" - def _content_hash(self, data: bytes) -> str: - """Compute content hash for deduplication.""" - return hashlib.sha256(data).hexdigest() + def encode(self, value, *, key=None) -> bytes: + from . import blob + return blob.pack(value, compress=True) - def _store_path(self, content_hash: str) -> str: - """Generate path within _external folder.""" - return f"_external/{self.subfolder}/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + def decode(self, stored, *, key=None) -> Any: + from . 
import blob + return blob.unpack(stored) +``` +### `` - External Serialized Blob (Content-Addressed) +```python @dj.register_type -class DJBlobExternalType(ContentAddressedType): - type_name = "djblob" # Same name, different dtype triggers external - dtype = "varchar(64)" # Store content hash as string - subfolder = "blobs" +class DJBlobExternalType(AttributeType): + type_name = "djblob" + dtype = "char(64)" # Content hash stored in column + is_content_addressed = True def encode(self, value, *, key=None, store=None) -> str: + from . import blob data = blob.pack(value, compress=True) - content_hash = self._content_hash(data) - path = self._store_path(content_hash) - # Upload to store if not exists (deduplication) - store.put_if_absent(path, data) + content_hash = hashlib.sha256(data).hexdigest() + + # Upload if not exists (deduplication) + path = content_path(content_hash) + if not store.exists(path): + store.put(path, data) + ContentRegistry().insert1({ + 'content_hash': content_hash, + 'store': store.name, + 'size': len(data) + }) + return content_hash - def decode(self, content_hash, *, key=None, store=None) -> Any: - path = self._store_path(content_hash) - data = store.get(path) - return blob.unpack(data) + def decode(self, content_hash, *, key=None, store=None) -> ObjectRef: + # Return ObjectRef for lazy access + return ObjectRef( + path=content_path(content_hash), + store=store, + loader=blob.unpack # Custom loader for deserialization + ) +``` +### `` - Internal File Attachment +```python @dj.register_type -class AttachExternalType(ContentAddressedType): +class AttachType(AttributeType): type_name = "attach" - dtype = "varchar(64)" - subfolder = "attach" + dtype = "longblob" - def encode(self, filepath, *, key=None, store=None) -> str: + def encode(self, filepath, *, key=None) -> bytes: path = Path(filepath) - # Hash includes filename for uniqueness - data = path.name.encode() + b"\0" + path.read_bytes() - content_hash = self._content_hash(data) - store_path = self._store_path(content_hash) + "/" + path.name - store.put_if_absent(store_path, path.read_bytes()) - return content_hash - - def decode(self, content_hash, *, key=None, store=None) -> str: - # List files in hash folder to get filename - ... -``` - -#### Design Option B: Separate Tracking Tables (Current Approach) - -Keep `~external_{store}` tables for tracking: + return path.name.encode() + b"\0" + path.read_bytes() -```sql --- ~external_main -hash : binary(16) # UUID from content hash ---- -size : bigint -attachment_name: varchar(255) # for attach only -timestamp : timestamp + def decode(self, stored, *, key=None) -> str: + filename, contents = stored.split(b"\0", 1) + filename = filename.decode() + download_path = Path(dj.config['download_path']) / filename + download_path.parent.mkdir(parents=True, exist_ok=True) + download_path.write_bytes(contents) + return str(download_path) ``` -**Disadvantages:** -- Separate infrastructure from OAS -- Additional table maintenance -- More complex cleanup/garbage collection - -#### Recommendation - -**Option A (OAS integration)** is cleaner: -- Single storage paradigm -- Simpler mental model -- Content hash stored directly in column (no UUID indirection) -- Deduplication at storage level - ---- - -### 5. 
Reference Tracking (``) - TO DESIGN - -Repurpose `filepath@store` as a general reference type, borrowing from ObjRef: - -**Current `filepath@store` limitations:** -- Path-addressed (hash of path, not contents) -- Requires staging area -- Archaic copy-to/copy-from model - -**Proposed ``:** -- Track references to external resources -- Support multiple reference types (file path, URL, object key) -- Borrow lazy access patterns from ObjRef -- Optional content verification +### `` - External File Attachment (Content-Addressed) ```python @dj.register_type -class RefType(AttributeType): - type_name = "ref" - dtype = "json" +class AttachExternalType(AttributeType): + type_name = "attach" + dtype = "char(64)" # Content hash stored in column + is_content_addressed = True - def encode(self, value, *, key=None, store=None) -> str: - if isinstance(value, str): - # Treat as path/URL - return json.dumps({ - 'type': 'path', - 'path': value, + def encode(self, filepath, *, key=None, store=None) -> str: + path = Path(filepath) + data = path.read_bytes() + # Hash includes filename for uniqueness + content_hash = hashlib.sha256( + path.name.encode() + b"\0" + data + ).hexdigest() + + # Store as folder with original filename preserved + obj_path = content_path(content_hash) + if not store.exists(obj_path): + store.put(f"{obj_path}/{path.name}", data) + ContentRegistry().insert1({ + 'content_hash': content_hash, 'store': store.name, - 'content_hash': self._compute_hash(value) if verify else None + 'size': len(data) }) - elif isinstance(value, RefSpec): - return json.dumps(value.to_dict()) - def decode(self, json_str, *, key=None, store=None) -> Ref: - data = json.loads(json_str) - return Ref(data, store=store) + return content_hash + + def decode(self, content_hash, *, key=None, store=None) -> ObjectRef: + return ObjectRef( + path=content_path(content_hash), + store=store, + # ObjectRef handles file download + ) +``` +## Unified ObjectRef Interface -class Ref: - """Reference handle (similar to ObjectRef).""" +All external storage (both path-addressed and content-addressed) returns `ObjectRef`: - def __init__(self, data, store): - self.path = data['path'] +```python +class ObjectRef: + """Lazy reference to stored object.""" + + def __init__(self, path, store, loader=None): + self.path = path self.store = store - self._content_hash = data.get('content_hash') + self._loader = loader # Optional custom deserializer - def download(self, local_path): - """Download referenced file.""" + def download(self, local_path=None) -> Path: + """Download object to local filesystem.""" + if local_path is None: + local_path = Path(dj.config['download_path']) / Path(self.path).name self.store.download(self.path, local_path) - if self._content_hash: - self._verify(local_path) + return local_path + + def load(self) -> Any: + """Load and optionally deserialize object.""" + data = self.store.get(self.path) + if self._loader: + return self._loader(data) + return data def open(self, mode='rb'): - """Open via fsspec (lazy).""" + """Open via fsspec for streaming access.""" return self.store.open(self.path, mode) ``` -**Usage:** -```python -class ExternalData(dj.Manual): - definition = """ - data_id : int - --- - source : # reference to external file - """ - -# Insert - just tracks the reference -table.insert1({'data_id': 1, 'source': '/archive/experiment_001/data.h5'}) - -# Fetch - returns Ref handle -row = (table & 'data_id=1').fetch1() -ref = row['source'] -ref.download('/local/data.h5') # explicit download -``` - ---- +## Summary -## 
Summary of Types +| Type | Storage | Column | Dedup | Returns | +|------|---------|--------|-------|---------| +| `object@store` | `{schema}/{table}/{pk}/` | JSON | No | ObjectRef | +| `` | Internal DB | LONGBLOB | No | Python object | +| `` | `_content/{hash}/` | char(64) | Yes | ObjectRef | +| `` | Internal DB | LONGBLOB | No | Local path | +| `` | `_content/{hash}/` | char(64) | Yes | ObjectRef | -| Type | Storage | Column | Input | Output | Dedup | -|------|---------|--------|-------|--------|-------| -| `object@store` | OAS store | JSON | path/ref | ObjectRef | By path | -| `` | Internal | LONGBLOB | any | any | No | -| `` | OAS `_external/` | varchar(64) | any | any | By content | -| `` | Internal | LONGBLOB | path | path | No | -| `` | OAS `_external/` | varchar(64) | path | path | By content | -| `` | OAS store | JSON | path/ref | Ref | No (tracks) | +## Key Design Decisions -## Open Questions - -1. **Store syntax**: Should external AttributeTypes use `` or detect externality from dtype? - -2. **Backward compatibility**: How to handle existing `blob@store` and `attach@store` columns with `~external_*` tables? - -3. **Deduplication scope**: Per-store or global across stores? +1. **Unified OAS paradigm**: All external storage uses OAS infrastructure +2. **Content-addressed region**: `_content/` folder for deduplicated objects +3. **Reference counting**: Via `ContentRegistry` table + query-based orphan detection +4. **ObjectRef everywhere**: External types return ObjectRef for consistent lazy access +5. **Deduplication**: Content hash determines identity; identical content stored once -4. **Ref vs filepath**: Deprecate `filepath@store` entirely or keep as alias? +## Migration from Legacy `~external_*` -5. **Content hash format**: SHA256 hex (64 chars) or shorter hash? +For existing schemas with `~external_*` tables: -## Implementation Phases +1. Read legacy external references +2. Re-upload to `_content/` region +3. Update column values to content hashes +4. Drop `~external_*` tables +5. Create `ContentRegistry` entries -### Phase 1: `` Internal -- Implement AttachType for internal blob storage -- Deprecate bare `attach` keyword (still works, warns) - -### Phase 2: Content-Addressed External -- Implement ContentAddressedType base -- Add `` and `` -- Store in OAS `_external/` folder - -### Phase 3: Reference Type -- Implement `` with Ref handle -- Deprecate `filepath@store` +## Open Questions -### Phase 4: Migration Tools -- Tools to migrate `~external_*` data to new format -- Backward compat layer for reading old format +1. **Hash collision**: SHA256 is effectively collision-free, but should we verify on fetch? +2. **Partial uploads**: How to handle interrupted uploads? Temp path then rename? +3. **Cross-schema deduplication**: Should `_content/` be per-schema or global? +4. **Backward compat**: How long to support reading from legacy `~external_*`? 
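+
+One possible answer to question 2 is already hinted at above (write to a temporary
+path, then rename into place). A minimal sketch of that approach, assuming an
+fsspec-style filesystem object; the helper names (`put_content_atomic`, `root`) are
+illustrative and not part of any existing DataJoint API:
+
+```python
+import hashlib
+import uuid
+
+
+def content_path(content_hash: str) -> str:
+    """Path of a content-addressed object within the store."""
+    return f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}"
+
+
+def put_content_atomic(fs, root: str, data: bytes) -> str:
+    """Upload content idempotently; never leaves a partial object at the final path."""
+    content_hash = hashlib.sha256(data).hexdigest()
+    final_path = f"{root}/{content_path(content_hash)}"
+    if fs.exists(final_path):  # deduplication: identical content already stored
+        return content_hash
+    tmp_dir = f"{root}/_content/_tmp"
+    fs.makedirs(tmp_dir, exist_ok=True)
+    tmp_path = f"{tmp_dir}/{uuid.uuid4().hex}"
+    with fs.open(tmp_path, "wb") as f:  # an interrupted write only affects the temp path
+        f.write(data)
+    fs.makedirs(final_path.rsplit("/", 1)[0], exist_ok=True)
+    fs.mv(tmp_path, final_path)  # rename into place; atomic on local/POSIX backends
+    return content_hash
+```
+
+On object stores where the final rename is a copy rather than an atomic move, the hash
+verification contemplated in question 1 could still catch a truncated object on fetch.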
From 495d7f7d667845296c3fd4f1eacbe5e81443e99e Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 05:35:36 +0000 Subject: [PATCH 13/41] Make and return values transparently - returns Python object (fetched and deserialized) - returns local file path (downloaded automatically) - Only object@store returns ObjectRef for explicit lazy access - External storage is transparent - @store only affects where, not how Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 94 ++++++++++---------- 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 844564755..6b90ac164 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -43,15 +43,15 @@ class Analysis(dj.Computed): - Path derived from content hash (SHA256) - Many-to-one: multiple rows can reference same object - Reference counted for garbage collection -- Returns `ObjectRef` for lazy access (same as regular OAS) +- **Transparent access**: Returns same type as internal variant (Python object or file path) ```python class ProcessedData(dj.Computed): definition = """ -> RawData --- - features : # Serialized Python object, deduplicated - source_file : # File attachment, deduplicated + features : # Returns Python object (fetched transparently) + source_file : # Returns local file path (downloaded transparently) """ ``` @@ -118,23 +118,27 @@ def garbage_collect(schema): (ContentRegistry() & {'content_hash': content_hash}).delete() ``` -### ObjectRef for Content-Addressed Objects +### Transparent Access for Content-Addressed Objects -Content-addressed objects return `ObjectRef` just like regular OAS objects: +Content-addressed objects return the same types as their internal counterparts: ```python row = (ProcessedData & key).fetch1() -# Both return ObjectRef -results_ref = row['features'] # -file_ref = row['source_file'] # +# returns Python object (like ) +features = row['features'] # dict, array, etc. - fetched and deserialized -# Same interface as regular OAS -results_ref.download('/local/path') -data = results_ref.load() # For djblob: deserialize -local_path = file_ref.download() # For attach: download, return path +# returns local file path (like ) +local_path = row['source_file'] # '/downloads/data.csv' - downloaded automatically + +# Only object@store returns ObjectRef for explicit lazy access +ref = row['results'] # ObjectRef - user controls when to download ``` +This makes external storage transparent - users work with Python objects and file paths, +not storage references. The `@store` suffix only affects where data is stored, not how +it's accessed. + ## AttributeType Implementations ### `` - Internal Serialized Blob @@ -180,13 +184,12 @@ class DJBlobExternalType(AttributeType): return content_hash - def decode(self, content_hash, *, key=None, store=None) -> ObjectRef: - # Return ObjectRef for lazy access - return ObjectRef( - path=content_path(content_hash), - store=store, - loader=blob.unpack # Custom loader for deserialization - ) + def decode(self, content_hash, *, key=None, store=None) -> Any: + # Fetch and deserialize - transparent to user + from . 
import blob + path = content_path(content_hash) + data = store.get(path) + return blob.unpack(data) ``` ### `` - Internal File Attachment @@ -227,7 +230,7 @@ class AttachExternalType(AttributeType): path.name.encode() + b"\0" + data ).hexdigest() - # Store as folder with original filename preserved + # Store with original filename preserved obj_path = content_path(content_hash) if not store.exists(obj_path): store.put(f"{obj_path}/{path.name}", data) @@ -239,26 +242,29 @@ class AttachExternalType(AttributeType): return content_hash - def decode(self, content_hash, *, key=None, store=None) -> ObjectRef: - return ObjectRef( - path=content_path(content_hash), - store=store, - # ObjectRef handles file download - ) + def decode(self, content_hash, *, key=None, store=None) -> str: + # Download and return local path - transparent to user + obj_path = content_path(content_hash) + # List to get filename (stored as {hash}/{filename}) + filename = store.list(obj_path)[0] + download_path = Path(dj.config['download_path']) / filename + download_path.parent.mkdir(parents=True, exist_ok=True) + store.download(f"{obj_path}/{filename}", download_path) + return str(download_path) ``` -## Unified ObjectRef Interface +## ObjectRef Interface (for `object@store` only) -All external storage (both path-addressed and content-addressed) returns `ObjectRef`: +Only `object@store` returns `ObjectRef` for explicit lazy access. This is intentional - +large files and folders (Zarr, HDF5, etc.) benefit from user-controlled download/access. ```python class ObjectRef: - """Lazy reference to stored object.""" + """Lazy reference to stored object (object@store only).""" - def __init__(self, path, store, loader=None): + def __init__(self, path, store): self.path = path self.store = store - self._loader = loader # Optional custom deserializer def download(self, local_path=None) -> Path: """Download object to local filesystem.""" @@ -267,35 +273,33 @@ class ObjectRef: self.store.download(self.path, local_path) return local_path - def load(self) -> Any: - """Load and optionally deserialize object.""" - data = self.store.get(self.path) - if self._loader: - return self._loader(data) - return data - def open(self, mode='rb'): - """Open via fsspec for streaming access.""" + """Open via fsspec for streaming/direct access.""" return self.store.open(self.path, mode) + + def exists(self) -> bool: + """Check if object exists in store.""" + return self.store.exists(self.path) ``` ## Summary | Type | Storage | Column | Dedup | Returns | |------|---------|--------|-------|---------| -| `object@store` | `{schema}/{table}/{pk}/` | JSON | No | ObjectRef | +| `object@store` | `{schema}/{table}/{pk}/` | JSON | No | ObjectRef (lazy) | | `` | Internal DB | LONGBLOB | No | Python object | -| `` | `_content/{hash}/` | char(64) | Yes | ObjectRef | -| `` | Internal DB | LONGBLOB | No | Local path | -| `` | `_content/{hash}/` | char(64) | Yes | ObjectRef | +| `` | `_content/{hash}/` | char(64) | Yes | Python object | +| `` | Internal DB | LONGBLOB | No | Local file path | +| `` | `_content/{hash}/` | char(64) | Yes | Local file path | ## Key Design Decisions 1. **Unified OAS paradigm**: All external storage uses OAS infrastructure 2. **Content-addressed region**: `_content/` folder for deduplicated objects 3. **Reference counting**: Via `ContentRegistry` table + query-based orphan detection -4. **ObjectRef everywhere**: External types return ObjectRef for consistent lazy access -5. 
**Deduplication**: Content hash determines identity; identical content stored once +4. **Transparent access**: `` and `` return same types as internal variants +5. **Lazy access for objects**: Only `object@store` returns ObjectRef (for large files/folders) +6. **Deduplication**: Content hash determines identity; identical content stored once ## Migration from Legacy `~external_*` From 7ae8f1577396b08a61eb8f271a69e0f508a87103 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 05:44:06 +0000 Subject: [PATCH 14/41] Introduce layered storage architecture with content core type Three-layer architecture: 1. MySQL types: longblob, varchar, etc. 2. Core DataJoint types: object, content (and @store variants) 3. AttributeTypes: , , , New core type `content` for content-addressed storage: - Accepts bytes, returns bytes - Handles hashing, deduplication, and GC registration - AttributeTypes like build serialization on top Naming convention: - = internal serialized (database) - = external serialized (content-addressed) - = internal file - = external file Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 398 +++++++++---------- 1 file changed, 190 insertions(+), 208 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 6b90ac164..3b48bb50a 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -2,152 +2,107 @@ ## Overview -This document proposes a unified storage architecture where all external storage uses the Object-Augmented Schema (OAS) paradigm, with a special content-addressable region for deduplicated objects. +This document defines a layered storage architecture: -## Architecture +1. **MySQL types**: `longblob`, `varchar`, `int`, etc. +2. **Core DataJoint types**: `object`, `content` (and their `@store` variants) +3. **AttributeTypes**: ``, ``, ``, etc. (built on top of core types) -### Two Storage Modes within OAS +## Core Types -``` -store_root/ -├── {schema}/{table}/{pk}/ # Path-addressed (regular OAS) -│ └── {attribute}/ # Derived from primary key -│ └── ... # Files, folders, Zarr, etc. -│ -└── _content/ # Content-addressed (deduplicated) - └── {hash[:2]}/{hash[2:4]}/ - └── {hash}/ # Full SHA256 hash - └── ... # Object contents -``` +### `object` / `object@store` - Path-Addressed Storage -### 1. Path-Addressed Objects (`object@store`) +**Already implemented.** OAS (Object-Augmented Schema) storage: -**Already implemented.** Regular OAS behavior: -- Path derived from primary key +- Path derived from primary key: `{schema}/{table}/{pk}/{attribute}/` - One-to-one relationship with table row - Deleted when row is deleted - Returns `ObjectRef` for lazy access +- Supports direct writes (Zarr, HDF5) via fsspec ```python class Analysis(dj.Computed): definition = """ -> Recording --- - results : object@main + results : object # default store + archive : object@cold # specific store """ ``` -### 2. 
Content-Addressed Objects (``, ``) +### `content` / `content@store` - Content-Addressed Storage -**New.** Stored in `_content/` region with deduplication: -- Path derived from content hash (SHA256) -- Many-to-one: multiple rows can reference same object -- Reference counted for garbage collection -- **Transparent access**: Returns same type as internal variant (Python object or file path) +**New core type.** Content-addressed storage with deduplication: -```python -class ProcessedData(dj.Computed): - definition = """ - -> RawData - --- - features : # Returns Python object (fetched transparently) - source_file : # Returns local file path (downloaded transparently) - """ -``` - -## Content-Addressed Storage Design - -### Storage Path - -```python -def content_path(content_hash: str) -> str: - """Generate path for content-addressed object.""" - return f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" +- Path derived from content hash: `_content/{hash[:2]}/{hash[2:4]}/{hash}/` +- Many-to-one: multiple rows can reference same content +- Reference counted for garbage collection +- Deduplication: identical content stored once -# Example: hash "a1b2c3d4..." -> "_content/a1/b2/a1b2c3d4..." ``` - -### Reference Registry - -A schema-level table tracks content-addressed objects for reference counting: - -```python -class ContentRegistry: - """ - Tracks content-addressed objects for garbage collection. - One per schema, created automatically when content-addressed types are used. - """ - definition = """ - # Content-addressed object registry - content_hash : char(64) # SHA256 hex - --- - store : varchar(64) # Store name - size : bigint unsigned # Object size in bytes - created : timestamp DEFAULT CURRENT_TIMESTAMP - """ +store_root/ +├── {schema}/{table}/{pk}/ # object storage (path-addressed) +│ └── {attribute}/ +│ +└── _content/ # content storage (content-addressed) + └── {hash[:2]}/{hash[2:4]}/{hash}/ ``` -### Reference Counting +#### Content Type Behavior -Reference counting is implicit via database queries: +The `content` core type: +- Accepts `bytes` on insert +- Computes SHA256 hash of the content +- Stores in `_content/{hash}/` if not already present (deduplication) +- Returns `bytes` on fetch (transparent retrieval) +- Registers in `ContentRegistry` for GC tracking ```python -def find_orphans(schema) -> list[tuple[str, str]]: - """Find content hashes not referenced by any table.""" +# Core type behavior (built-in, not an AttributeType) +class ContentType: + """Core content-addressed storage type.""" - # Get all registered hashes - registered = set(ContentRegistry().fetch('content_hash', 'store')) + def store(self, data: bytes, store_backend) -> str: + """Store content, return hash.""" + content_hash = hashlib.sha256(data).hexdigest() + path = f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" - # Get all referenced hashes from tables - referenced = set() - for table in schema.tables: - for attr in table.heading.attributes: - if attr.is_content_addressed: - hashes = table.fetch(attr.name) - referenced.update((h, attr.store) for h in hashes) + if not store_backend.exists(path): + store_backend.put(path, data) + ContentRegistry().insert1({ + 'content_hash': content_hash, + 'store': store_backend.name, + 'size': len(data) + }) - return registered - referenced + return content_hash -def garbage_collect(schema): - """Remove orphaned content-addressed objects.""" - for content_hash, store in find_orphans(schema): - # Delete from storage - store_backend = 
get_store(store) - store_backend.delete(content_path(content_hash)) - # Delete from registry - (ContentRegistry() & {'content_hash': content_hash}).delete() + def retrieve(self, content_hash: str, store_backend) -> bytes: + """Retrieve content by hash.""" + path = f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + return store_backend.get(path) ``` -### Transparent Access for Content-Addressed Objects - -Content-addressed objects return the same types as their internal counterparts: - -```python -row = (ProcessedData & key).fetch1() - -# returns Python object (like ) -features = row['features'] # dict, array, etc. - fetched and deserialized +#### Database Column -# returns local file path (like ) -local_path = row['source_file'] # '/downloads/data.csv' - downloaded automatically +The `content` type stores a `char(64)` hash in the database: -# Only object@store returns ObjectRef for explicit lazy access -ref = row['results'] # ObjectRef - user controls when to download +```sql +-- content column +features CHAR(64) NOT NULL -- SHA256 hex hash ``` -This makes external storage transparent - users work with Python objects and file paths, -not storage references. The `@store` suffix only affects where data is stored, not how -it's accessed. - -## AttributeType Implementations +## AttributeTypes (Built on Core Types) ### `` - Internal Serialized Blob +Serialized Python object stored in database. + ```python @dj.register_type class DJBlobType(AttributeType): type_name = "djblob" - dtype = "longblob" + dtype = "longblob" # MySQL type def encode(self, value, *, key=None) -> bytes: from . import blob @@ -158,42 +113,42 @@ class DJBlobType(AttributeType): return blob.unpack(stored) ``` -### `` - External Serialized Blob (Content-Addressed) +### `` / `` - External Serialized Blob + +Serialized Python object stored in content-addressed storage. ```python @dj.register_type -class DJBlobExternalType(AttributeType): - type_name = "djblob" - dtype = "char(64)" # Content hash stored in column - is_content_addressed = True +class XBlobType(AttributeType): + type_name = "xblob" + dtype = "content" # Core type - uses default store + # dtype = "content@store" for specific store - def encode(self, value, *, key=None, store=None) -> str: + def encode(self, value, *, key=None) -> bytes: from . import blob - data = blob.pack(value, compress=True) - content_hash = hashlib.sha256(data).hexdigest() - - # Upload if not exists (deduplication) - path = content_path(content_hash) - if not store.exists(path): - store.put(path, data) - ContentRegistry().insert1({ - 'content_hash': content_hash, - 'store': store.name, - 'size': len(data) - }) - - return content_hash + return blob.pack(value, compress=True) - def decode(self, content_hash, *, key=None, store=None) -> Any: - # Fetch and deserialize - transparent to user + def decode(self, stored, *, key=None) -> Any: from . import blob - path = content_path(content_hash) - data = store.get(path) - return blob.unpack(data) + return blob.unpack(stored) +``` + +Usage: +```python +class ProcessedData(dj.Computed): + definition = """ + -> RawData + --- + small_result : # internal (in database) + large_result : # external (default store) + archive_result : # external (specific store) + """ ``` ### `` - Internal File Attachment +File stored in database with filename preserved. 
+ ```python @dj.register_type class AttachType(AttributeType): @@ -213,107 +168,134 @@ class AttachType(AttributeType): return str(download_path) ``` -### `` - External File Attachment (Content-Addressed) +### `` / `` - External File Attachment + +File stored in content-addressed storage with filename preserved. ```python @dj.register_type -class AttachExternalType(AttributeType): - type_name = "attach" - dtype = "char(64)" # Content hash stored in column - is_content_addressed = True +class XAttachType(AttributeType): + type_name = "xattach" + dtype = "content" # Core type - def encode(self, filepath, *, key=None, store=None) -> str: + def encode(self, filepath, *, key=None) -> bytes: path = Path(filepath) - data = path.read_bytes() - # Hash includes filename for uniqueness - content_hash = hashlib.sha256( - path.name.encode() + b"\0" + data - ).hexdigest() - - # Store with original filename preserved - obj_path = content_path(content_hash) - if not store.exists(obj_path): - store.put(f"{obj_path}/{path.name}", data) - ContentRegistry().insert1({ - 'content_hash': content_hash, - 'store': store.name, - 'size': len(data) - }) - - return content_hash + # Include filename in stored data + return path.name.encode() + b"\0" + path.read_bytes() - def decode(self, content_hash, *, key=None, store=None) -> str: - # Download and return local path - transparent to user - obj_path = content_path(content_hash) - # List to get filename (stored as {hash}/{filename}) - filename = store.list(obj_path)[0] + def decode(self, stored, *, key=None) -> str: + filename, contents = stored.split(b"\0", 1) + filename = filename.decode() download_path = Path(dj.config['download_path']) / filename download_path.parent.mkdir(parents=True, exist_ok=True) - store.download(f"{obj_path}/{filename}", download_path) + download_path.write_bytes(contents) return str(download_path) ``` -## ObjectRef Interface (for `object@store` only) +Usage: +```python +class Attachments(dj.Manual): + definition = """ + attachment_id : int + --- + config : # internal (small file in DB) + data_file : # external (default store) + archive : # external (specific store) + """ +``` -Only `object@store` returns `ObjectRef` for explicit lazy access. This is intentional - -large files and folders (Zarr, HDF5, etc.) benefit from user-controlled download/access. +## Type Layering Summary -```python -class ObjectRef: - """Lazy reference to stored object (object@store only).""" - - def __init__(self, path, store): - self.path = path - self.store = store - - def download(self, local_path=None) -> Path: - """Download object to local filesystem.""" - if local_path is None: - local_path = Path(dj.config['download_path']) / Path(self.path).name - self.store.download(self.path, local_path) - return local_path - - def open(self, mode='rb'): - """Open via fsspec for streaming/direct access.""" - return self.store.open(self.path, mode) - - def exists(self) -> bool: - """Check if object exists in store.""" - return self.store.exists(self.path) +``` +┌─────────────────────────────────────────────────────────────┐ +│ AttributeTypes │ +│ │ +├─────────────────────────────────────────────────────────────┤ +│ Core DataJoint Types │ +│ longblob content object │ +│ content@store object@store │ +├─────────────────────────────────────────────────────────────┤ +│ MySQL Types │ +│ LONGBLOB CHAR(64) JSON VARCHAR INT etc. 
│ +└─────────────────────────────────────────────────────────────┘ ``` -## Summary +## Storage Comparison -| Type | Storage | Column | Dedup | Returns | -|------|---------|--------|-------|---------| -| `object@store` | `{schema}/{table}/{pk}/` | JSON | No | ObjectRef (lazy) | -| `` | Internal DB | LONGBLOB | No | Python object | -| `` | `_content/{hash}/` | char(64) | Yes | Python object | -| `` | Internal DB | LONGBLOB | No | Local file path | -| `` | `_content/{hash}/` | char(64) | Yes | Local file path | +| AttributeType | Core Type | Storage Location | Dedup | Returns | +|---------------|-----------|------------------|-------|---------| +| `` | `longblob` | Database | No | Python object | +| `` | `content` | `_content/{hash}/` | Yes | Python object | +| `` | `content@store` | `_content/{hash}/` | Yes | Python object | +| `` | `longblob` | Database | No | Local file path | +| `` | `content` | `_content/{hash}/` | Yes | Local file path | +| `` | `content@store` | `_content/{hash}/` | Yes | Local file path | +| — | `object` | `{schema}/{table}/{pk}/` | No | ObjectRef | +| — | `object@store` | `{schema}/{table}/{pk}/` | No | ObjectRef | -## Key Design Decisions +## Reference Counting for Content Type + +The `ContentRegistry` table tracks content-addressed objects: + +```python +class ContentRegistry: + definition = """ + # Content-addressed object registry + content_hash : char(64) # SHA256 hex + --- + store : varchar(64) # Store name + size : bigint unsigned # Size in bytes + created : timestamp DEFAULT CURRENT_TIMESTAMP + """ +``` -1. **Unified OAS paradigm**: All external storage uses OAS infrastructure -2. **Content-addressed region**: `_content/` folder for deduplicated objects -3. **Reference counting**: Via `ContentRegistry` table + query-based orphan detection -4. **Transparent access**: `` and `` return same types as internal variants -5. **Lazy access for objects**: Only `object@store` returns ObjectRef (for large files/folders) -6. **Deduplication**: Content hash determines identity; identical content stored once +Garbage collection finds orphaned content: -## Migration from Legacy `~external_*` +```python +def garbage_collect(schema): + """Remove content not referenced by any table.""" + # Get all registered hashes + registered = set(ContentRegistry().fetch('content_hash', 'store')) + + # Get all referenced hashes from tables with content-type columns + referenced = set() + for table in schema.tables: + for attr in table.heading.attributes: + if attr.type in ('content', 'content@...'): + hashes = table.fetch(attr.name) + referenced.update((h, attr.store) for h in hashes) + + # Delete orphaned content + for content_hash, store in (registered - referenced): + store_backend = get_store(store) + store_backend.delete(content_path(content_hash)) + (ContentRegistry() & {'content_hash': content_hash}).delete() +``` -For existing schemas with `~external_*` tables: +## Key Design Decisions -1. Read legacy external references -2. Re-upload to `_content/` region -3. Update column values to content hashes -4. Drop `~external_*` tables -5. Create `ContentRegistry` entries +1. **Layered architecture**: Core types (`content`, `object`) separate from AttributeTypes +2. **Content type**: New core type for content-addressed, deduplicated storage +3. **Naming convention**: + - `` = internal serialized (database) + - `` = external serialized (content-addressed) + - `` = internal file + - `` = external file +4. 
**Transparent access**: AttributeTypes return Python objects or file paths, not references +5. **Lazy access for objects**: Only `object`/`object@store` returns ObjectRef + +## Migration from Legacy Types + +| Legacy | New Equivalent | +|--------|----------------| +| `longblob` (auto-serialized) | `` | +| `blob@store` | `` | +| `attach` | `` | +| `attach@store` | `` | +| `filepath@store` | Deprecated (use `object@store` or ``) | ## Open Questions -1. **Hash collision**: SHA256 is effectively collision-free, but should we verify on fetch? -2. **Partial uploads**: How to handle interrupted uploads? Temp path then rename? -3. **Cross-schema deduplication**: Should `_content/` be per-schema or global? -4. **Backward compat**: How long to support reading from legacy `~external_*`? +1. Should `content` without `@store` use a default store, or require explicit store? +2. Should we support `` without `@store` syntax (implying default store)? +3. Should `filepath@store` be kept for backward compat or fully deprecated? From 6fcc4d310b1996f8734f6a2bb898f4be817f41c4 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 05:50:31 +0000 Subject: [PATCH 15/41] Add parameterized AttributeTypes and content vs object comparison - content type is single-blob only (no folders) - Parameterized syntax: passes param to dtype - Add content vs object comparison table - Clarify when to use each type Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 57 +++++++++++++++++--- 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 3b48bb50a..09e3ebecf 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -34,10 +34,12 @@ class Analysis(dj.Computed): **New core type.** Content-addressed storage with deduplication: -- Path derived from content hash: `_content/{hash[:2]}/{hash[2:4]}/{hash}/` +- **Single blob only**: stores a single file or serialized object (not folders) +- Path derived from content hash: `_content/{hash[:2]}/{hash[2:4]}/{hash}` - Many-to-one: multiple rows can reference same content - Reference counted for garbage collection - Deduplication: identical content stored once +- For folders/complex objects, use `object` type instead ``` store_root/ @@ -92,6 +94,31 @@ The `content` type stores a `char(64)` hash in the database: features CHAR(64) NOT NULL -- SHA256 hex hash ``` +## Parameterized AttributeTypes + +AttributeTypes can be parameterized with `` syntax. The parameter is passed +through to the underlying dtype: + +```python +class AttributeType: + type_name: str # Name used in + dtype: str # Base underlying type + + # When user writes , resolved dtype becomes: + # f"{dtype}@{param}" if param specified, else dtype +``` + +**Resolution examples:** +``` + → dtype = "content" → default store + → dtype = "content@cold" → cold store + → dtype = "longblob" → database + → ERROR: longblob doesn't support parameters +``` + +This means `` and `` share the same AttributeType class - the +parameter flows through to the core type, which validates whether it supports `@store`. 
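+
+A minimal sketch of this resolution rule, assuming a plain `dict` stands in for the
+global type registry and that only the `content` and `object` core types accept an
+`@store` parameter (the function and constant names below are illustrative, not the
+library's actual API):
+
+```python
+import re
+
+# Core dtypes assumed to accept an "@store" parameter in this sketch
+STORE_PARAMETERIZABLE = {"content", "object"}
+
+
+def resolve_declared_type(declared: str, registry: dict) -> str:
+    """Resolve a declaration such as '<xblob@cold>' to its final core dtype."""
+    m = re.fullmatch(r"<(?P<name>\w+)(?:@(?P<param>\w+))?>", declared.strip())
+    if m is None:
+        raise ValueError(f"not a custom type declaration: {declared}")
+    attr_type = registry[m["name"]]            # e.g. XBlobType with dtype="content"
+    base, _, _ = attr_type.dtype.partition("@")
+    if m["param"] is None:
+        return attr_type.dtype                 # e.g. "content" -> default store
+    if base not in STORE_PARAMETERIZABLE:
+        raise ValueError(f"{base} does not accept an @store parameter")
+    return f"{base}@{m['param']}"              # e.g. "content@cold"
+```
+
+With this rule, `<xblob@cold>` resolves to `content@cold`, while `<djblob@cold>` raises
+an error because `longblob` is not parameterizable - matching the resolution examples above.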
+ ## AttributeTypes (Built on Core Types) ### `` - Internal Serialized Blob @@ -272,17 +299,33 @@ def garbage_collect(schema): (ContentRegistry() & {'content_hash': content_hash}).delete() ``` +## Content vs Object: When to Use Each + +| Feature | `content` | `object` | +|---------|-----------|----------| +| Addressing | Content hash (SHA256) | Path (from primary key) | +| Deduplication | Yes | No | +| Structure | Single blob only | Files, folders, Zarr, HDF5 | +| Access | Transparent (returns bytes) | Lazy (returns ObjectRef) | +| GC | Reference counted | Deleted with row | +| Use case | Serialized data, file attachments | Large/complex objects, streaming | + +**Rule of thumb:** +- Need deduplication or storing serialized Python objects? → `content` via `` +- Need folders, Zarr, HDF5, or streaming access? → `object` + ## Key Design Decisions 1. **Layered architecture**: Core types (`content`, `object`) separate from AttributeTypes -2. **Content type**: New core type for content-addressed, deduplicated storage -3. **Naming convention**: +2. **Content type**: Single-blob, content-addressed, deduplicated storage +3. **Parameterized types**: `` passes parameter to underlying dtype +4. **Naming convention**: - `` = internal serialized (database) - `` = external serialized (content-addressed) - - `` = internal file - - `` = external file -4. **Transparent access**: AttributeTypes return Python objects or file paths, not references -5. **Lazy access for objects**: Only `object`/`object@store` returns ObjectRef + - `` = internal file (single file) + - `` = external file (single file) +5. **Transparent access**: AttributeTypes return Python objects or file paths, not references +6. **Lazy access for objects**: Only `object`/`object@store` returns ObjectRef ## Migration from Legacy Types From b87342bddc4afe1d7ba14ed863ee2af08825f30f Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 05:53:34 +0000 Subject: [PATCH 16/41] Make content storage per-project and add migration utility - Content-addressed storage is now per-project (not per-schema) - Deduplication works across all schemas in a project - ContentRegistry is project-level (e.g., {project}_content database) - GC scans all schemas in project for references - Add migration utility for legacy ~external_* per-schema stores - Document migration from binary(16) UUID to char(64) SHA256 hash Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 90 +++++++++++++++++--- 1 file changed, 77 insertions(+), 13 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 09e3ebecf..381cbf1c5 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -35,10 +35,11 @@ class Analysis(dj.Computed): **New core type.** Content-addressed storage with deduplication: - **Single blob only**: stores a single file or serialized object (not folders) +- **Per-project scope**: content is shared across all schemas in a project (not per-schema) - Path derived from content hash: `_content/{hash[:2]}/{hash[2:4]}/{hash}` -- Many-to-one: multiple rows can reference same content +- Many-to-one: multiple rows (even across schemas) can reference same content - Reference counted for garbage collection -- Deduplication: identical content stored once +- Deduplication: identical content stored once across the entire project - For folders/complex objects, use `object` type instead ``` @@ -262,12 +263,17 @@ class 
Attachments(dj.Manual): ## Reference Counting for Content Type -The `ContentRegistry` table tracks content-addressed objects: +The `ContentRegistry` is a **project-level** table that tracks content-addressed objects +across all schemas. This differs from the legacy `~external_*` tables which were per-schema. ```python class ContentRegistry: + """ + Project-level content registry. + Stored in a designated database (e.g., `{project}_content`). + """ definition = """ - # Content-addressed object registry + # Content-addressed object registry (project-wide) content_hash : char(64) # SHA256 hex --- store : varchar(64) # Store name @@ -276,21 +282,22 @@ class ContentRegistry: """ ``` -Garbage collection finds orphaned content: +Garbage collection scans **all schemas** in the project: ```python -def garbage_collect(schema): - """Remove content not referenced by any table.""" +def garbage_collect(project): + """Remove content not referenced by any table in any schema.""" # Get all registered hashes registered = set(ContentRegistry().fetch('content_hash', 'store')) - # Get all referenced hashes from tables with content-type columns + # Get all referenced hashes from ALL schemas in the project referenced = set() - for table in schema.tables: - for attr in table.heading.attributes: - if attr.type in ('content', 'content@...'): - hashes = table.fetch(attr.name) - referenced.update((h, attr.store) for h in hashes) + for schema in project.schemas: + for table in schema.tables: + for attr in table.heading.attributes: + if attr.type in ('content', 'content@...'): + hashes = table.fetch(attr.name) + referenced.update((h, attr.store) for h in hashes) # Delete orphaned content for content_hash, store in (registered - referenced): @@ -337,8 +344,65 @@ def garbage_collect(schema): | `attach@store` | `` | | `filepath@store` | Deprecated (use `object@store` or ``) | +### Migration from Legacy `~external_*` Stores + +Legacy external storage used per-schema `~external_{store}` tables. Migration to the new +per-project `ContentRegistry` requires: + +```python +def migrate_external_store(schema, store_name): + """ + Migrate legacy ~external_{store} to new ContentRegistry. + + 1. Read all entries from ~external_{store} + 2. For each entry: + - Fetch content from legacy location + - Compute SHA256 hash + - Copy to _content/{hash}/ if not exists + - Update table column from UUID to hash + - Register in ContentRegistry + 3. After all schemas migrated, drop ~external_{store} tables + """ + external_table = schema.external[store_name] + + for entry in external_table.fetch(as_dict=True): + legacy_uuid = entry['hash'] + + # Fetch content from legacy location + content = external_table.get(legacy_uuid) + + # Compute new content hash + content_hash = hashlib.sha256(content).hexdigest() + + # Store in new location if not exists + new_path = f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + store = get_store(store_name) + if not store.exists(new_path): + store.put(new_path, content) + + # Register in project-wide ContentRegistry + ContentRegistry().insert1({ + 'content_hash': content_hash, + 'store': store_name, + 'size': len(content) + }, skip_duplicates=True) + + # Update referencing tables (UUID -> hash) + # ... update all tables that reference this UUID ... 
+ + # After migration complete for all schemas: + # DROP TABLE `{schema}`.`~external_{store}` +``` + +**Migration considerations:** +- Legacy UUIDs were based on content hash but stored as `binary(16)` +- New system uses `char(64)` SHA256 hex strings +- Migration can be done incrementally per schema +- Backward compatibility layer can read both formats during transition + ## Open Questions 1. Should `content` without `@store` use a default store, or require explicit store? 2. Should we support `` without `@store` syntax (implying default store)? 3. Should `filepath@store` be kept for backward compat or fully deprecated? +4. How long should the backward compatibility layer support legacy `~external_*` format? From 40c1dbbca99517ca8a64bdf41a4994219addd31d Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 06:01:02 +0000 Subject: [PATCH 17/41] Add filepath as third OAS region with ObjectRef interface Three OAS storage regions: 1. object: {schema}/{table}/{pk}/ - PK-addressed, DataJoint controls 2. content: _content/{hash} - content-addressed, deduplicated 3. filepath: _files/{user-path} - user-addressed, user controls Upgraded filepath@store: - Returns ObjectRef (lazy) instead of copying files - Supports streaming via ref.open() - Supports folders (like object) - Stores checksum in JSON column for verification - No more automatic copy to local stage Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 188 ++++++++++++++----- 1 file changed, 145 insertions(+), 43 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 381cbf1c5..7ca4522c6 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -5,9 +5,17 @@ This document defines a layered storage architecture: 1. **MySQL types**: `longblob`, `varchar`, `int`, etc. -2. **Core DataJoint types**: `object`, `content` (and their `@store` variants) +2. **Core DataJoint types**: `object`, `content`, `filepath` (and their `@store` variants) 3. **AttributeTypes**: ``, ``, ``, etc. 
(built on top of core types) +### Three OAS Storage Regions + +| Region | Path Pattern | Addressing | Use Case | +|--------|--------------|------------|----------| +| Object | `{schema}/{table}/{pk}/` | Primary key | Large objects, Zarr, HDF5 | +| Content | `_content/{hash}` | Content hash | Deduplicated blobs/files | +| Filepath | `_files/{user-path}` | User-defined | User-organized files | + ## Core Types ### `object` / `object@store` - Path-Addressed Storage @@ -44,11 +52,14 @@ class Analysis(dj.Computed): ``` store_root/ -├── {schema}/{table}/{pk}/ # object storage (path-addressed) +├── {schema}/{table}/{pk}/ # object storage (path-addressed by PK) │ └── {attribute}/ │ -└── _content/ # content storage (content-addressed) - └── {hash[:2]}/{hash[2:4]}/{hash}/ +├── _content/ # content storage (content-addressed) +│ └── {hash[:2]}/{hash[2:4]}/{hash} +│ +└── _files/ # filepath storage (user-addressed) + └── {user-defined-path} ``` #### Content Type Behavior @@ -95,6 +106,92 @@ The `content` type stores a `char(64)` hash in the database: features CHAR(64) NOT NULL -- SHA256 hex hash ``` +### `filepath` / `filepath@store` - User-Addressed Storage + +**Upgraded from legacy.** User-defined path organization with ObjectRef access: + +- **User controls paths**: relative path specified by user (not derived from PK or hash) +- Stored in `_files/{user-path}` within the store +- Returns `ObjectRef` for lazy access (no automatic copying) +- Stores checksum in database for verification +- Supports files and folders (like `object`) + +```python +class RawData(dj.Manual): + definition = """ + session_id : int + --- + recording : filepath@raw # user specifies path + """ + +# Insert - user provides relative path +table.insert1({ + 'session_id': 1, + 'recording': 'experiment_001/session_001/data.nwb' +}) + +# Fetch - returns ObjectRef (lazy, no copy) +row = (table & 'session_id=1').fetch1() +ref = row['recording'] # ObjectRef +ref.download('/local/path') # explicit download +ref.open() # fsspec streaming access +``` + +#### Filepath Type Behavior + +```python +# Core type behavior +class FilepathType: + """Core user-addressed storage type.""" + + def store(self, user_path: str, store_backend) -> dict: + """ + Register filepath, return metadata. + File must already exist at _files/{user_path} in store. + """ + full_path = f"_files/{user_path}" + if not store_backend.exists(full_path): + raise FileNotFoundError(f"File not found: {full_path}") + + # Compute checksum for verification + checksum = store_backend.checksum(full_path) + size = store_backend.size(full_path) + + return { + 'path': user_path, + 'checksum': checksum, + 'size': size + } + + def retrieve(self, metadata: dict, store_backend) -> ObjectRef: + """Return ObjectRef for lazy access.""" + return ObjectRef( + path=f"_files/{metadata['path']}", + store=store_backend, + checksum=metadata.get('checksum') # for verification + ) +``` + +#### Database Column + +The `filepath` type stores JSON metadata: + +```sql +-- filepath column +recording JSON NOT NULL +-- Contains: {"path": "...", "checksum": "...", "size": ...} +``` + +#### Key Differences from Legacy `filepath@store` + +| Feature | Legacy | New | +|---------|--------|-----| +| Access | Copy to local stage | ObjectRef (lazy) | +| Copying | Automatic | Explicit via `ref.download()` | +| Streaming | No | Yes via `ref.open()` | +| Folders | No | Yes | +| Interface | Returns local path | Returns ObjectRef | + ## Parameterized AttributeTypes AttributeTypes can be parameterized with `` syntax. 
The parameter is passed @@ -235,31 +332,32 @@ class Attachments(dj.Manual): ## Type Layering Summary ``` -┌─────────────────────────────────────────────────────────────┐ -│ AttributeTypes │ -│ │ -├─────────────────────────────────────────────────────────────┤ -│ Core DataJoint Types │ -│ longblob content object │ -│ content@store object@store │ -├─────────────────────────────────────────────────────────────┤ -│ MySQL Types │ -│ LONGBLOB CHAR(64) JSON VARCHAR INT etc. │ -└─────────────────────────────────────────────────────────────┘ +┌───────────────────────────────────────────────────────────────────┐ +│ AttributeTypes │ +│ │ +├───────────────────────────────────────────────────────────────────┤ +│ Core DataJoint Types │ +│ longblob content object filepath │ +│ content@s object@s filepath@s │ +├───────────────────────────────────────────────────────────────────┤ +│ MySQL Types │ +│ LONGBLOB CHAR(64) JSON JSON VARCHAR etc. │ +└───────────────────────────────────────────────────────────────────┘ ``` ## Storage Comparison -| AttributeType | Core Type | Storage Location | Dedup | Returns | -|---------------|-----------|------------------|-------|---------| +| Type | Core Type | Storage Location | Dedup | Returns | +|------|-----------|------------------|-------|---------| | `` | `longblob` | Database | No | Python object | -| `` | `content` | `_content/{hash}/` | Yes | Python object | -| `` | `content@store` | `_content/{hash}/` | Yes | Python object | +| `` | `content` | `_content/{hash}` | Yes | Python object | +| `` | `content@s` | `_content/{hash}` | Yes | Python object | | `` | `longblob` | Database | No | Local file path | -| `` | `content` | `_content/{hash}/` | Yes | Local file path | -| `` | `content@store` | `_content/{hash}/` | Yes | Local file path | -| — | `object` | `{schema}/{table}/{pk}/` | No | ObjectRef | -| — | `object@store` | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `` | `content` | `_content/{hash}` | Yes | Local file path | +| `` | `content@s` | `_content/{hash}` | Yes | Local file path | +| `object` | — | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `object@s` | — | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `filepath@s` | — | `_files/{user-path}` | No | ObjectRef | ## Reference Counting for Content Type @@ -306,33 +404,37 @@ def garbage_collect(project): (ContentRegistry() & {'content_hash': content_hash}).delete() ``` -## Content vs Object: When to Use Each +## Core Type Comparison -| Feature | `content` | `object` | -|---------|-----------|----------| -| Addressing | Content hash (SHA256) | Path (from primary key) | -| Deduplication | Yes | No | -| Structure | Single blob only | Files, folders, Zarr, HDF5 | -| Access | Transparent (returns bytes) | Lazy (returns ObjectRef) | -| GC | Reference counted | Deleted with row | -| Use case | Serialized data, file attachments | Large/complex objects, streaming | +| Feature | `object` | `content` | `filepath` | +|---------|----------|-----------|------------| +| Addressing | Primary key | Content hash | User-defined path | +| Path control | DataJoint | DataJoint | User | +| Deduplication | No | Yes | No | +| Structure | Files, folders, Zarr | Single blob only | Files, folders | +| Access | ObjectRef (lazy) | Transparent (bytes) | ObjectRef (lazy) | +| GC | Deleted with row | Reference counted | Deleted with row | +| Checksum | Optional | Implicit (is the hash) | Stored in DB | -**Rule of thumb:** -- Need deduplication or storing serialized Python objects? 
→ `content` via `` -- Need folders, Zarr, HDF5, or streaming access? → `object` +**When to use each:** +- **`object`**: Large/complex objects where DataJoint controls organization (Zarr, HDF5) +- **`content`**: Deduplicated serialized data or file attachments via ``, `` +- **`filepath`**: User-managed file organization, external data sources ## Key Design Decisions -1. **Layered architecture**: Core types (`content`, `object`) separate from AttributeTypes -2. **Content type**: Single-blob, content-addressed, deduplicated storage -3. **Parameterized types**: `` passes parameter to underlying dtype -4. **Naming convention**: +1. **Layered architecture**: Core types (`object`, `content`, `filepath`) separate from AttributeTypes +2. **Three OAS regions**: object (PK-addressed), content (hash-addressed), filepath (user-addressed) +3. **Content type**: Single-blob, content-addressed, deduplicated storage +4. **Filepath upgrade**: Returns ObjectRef (lazy) instead of copying files +5. **Parameterized types**: `` passes parameter to underlying dtype +6. **Naming convention**: - `` = internal serialized (database) - `` = external serialized (content-addressed) - `` = internal file (single file) - `` = external file (single file) -5. **Transparent access**: AttributeTypes return Python objects or file paths, not references -6. **Lazy access for objects**: Only `object`/`object@store` returns ObjectRef +7. **Transparent access**: AttributeTypes return Python objects or file paths +8. **Lazy access**: `object`, `object@store`, and `filepath@store` return ObjectRef ## Migration from Legacy Types @@ -342,7 +444,7 @@ def garbage_collect(project): | `blob@store` | `` | | `attach` | `` | | `attach@store` | `` | -| `filepath@store` | Deprecated (use `object@store` or ``) | +| `filepath@store` (copy-based) | `filepath@store` (ObjectRef-based, upgraded) | ### Migration from Legacy `~external_*` Stores @@ -404,5 +506,5 @@ def migrate_external_store(schema, store_name): 1. Should `content` without `@store` use a default store, or require explicit store? 2. Should we support `` without `@store` syntax (implying default store)? -3. Should `filepath@store` be kept for backward compat or fully deprecated? +3. Should `filepath` without `@store` be supported (using default store)? 4. How long should the backward compatibility layer support legacy `~external_*` format? From dbf092d76ed7d1f206453a70882da68d2bbbc3cf Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 06:06:06 +0000 Subject: [PATCH 18/41] Redesign filepath as URI reference tracker and add json core type filepath changes: - No longer an OAS region - tracks external URIs anywhere - Supports any fsspec-compatible URI (s3://, https://, gs://, etc.) 
- Returns ObjectRef for lazy access via fsspec - No integrity guarantees (external resources may change) - Uses json core type for storage json core type: - Cross-database compatible (MySQL JSON, PostgreSQL JSONB) - Used by filepath and object types Two OAS regions remain: - object: PK-addressed, DataJoint controlled - content: hash-addressed, deduplicated Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 168 ++++++++++++------- 1 file changed, 106 insertions(+), 62 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 7ca4522c6..b4b149628 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -4,17 +4,24 @@ This document defines a layered storage architecture: -1. **MySQL types**: `longblob`, `varchar`, `int`, etc. -2. **Core DataJoint types**: `object`, `content`, `filepath` (and their `@store` variants) +1. **Database types**: `longblob`, `varchar`, `int`, `json`, etc. +2. **Core DataJoint types**: `object`, `content`, `filepath`, `json` (and `@store` variants where applicable) 3. **AttributeTypes**: ``, ``, ``, etc. (built on top of core types) -### Three OAS Storage Regions +### OAS Storage Regions | Region | Path Pattern | Addressing | Use Case | |--------|--------------|------------|----------| | Object | `{schema}/{table}/{pk}/` | Primary key | Large objects, Zarr, HDF5 | | Content | `_content/{hash}` | Content hash | Deduplicated blobs/files | -| Filepath | `_files/{user-path}` | User-defined | User-organized files | + +### External References + +`filepath` is **not** an OAS region - it's a general reference tracker for external resources: +- OAS store paths: `store://main/experiment/data.h5` +- URLs: `https://example.com/dataset.zip` +- S3: `s3://bucket/key/file.nwb` +- Any fsspec-compatible URI ## Core Types @@ -55,11 +62,8 @@ store_root/ ├── {schema}/{table}/{pk}/ # object storage (path-addressed by PK) │ └── {attribute}/ │ -├── _content/ # content storage (content-addressed) -│ └── {hash[:2]}/{hash[2:4]}/{hash} -│ -└── _files/ # filepath storage (user-addressed) - └── {user-defined-path} +└── _content/ # content storage (content-addressed) + └── {hash[:2]}/{hash[2:4]}/{hash} ``` #### Content Type Behavior @@ -106,31 +110,41 @@ The `content` type stores a `char(64)` hash in the database: features CHAR(64) NOT NULL -- SHA256 hex hash ``` -### `filepath` / `filepath@store` - User-Addressed Storage +### `filepath` - External Reference Tracker -**Upgraded from legacy.** User-defined path organization with ObjectRef access: +**Upgraded from legacy.** General-purpose reference tracker for external resources: -- **User controls paths**: relative path specified by user (not derived from PK or hash) -- Stored in `_files/{user-path}` within the store -- Returns `ObjectRef` for lazy access (no automatic copying) -- Stores checksum in database for verification -- Supports files and folders (like `object`) +- **Not an OAS region**: references can point anywhere (URLs, S3, OAS stores, etc.) 
+- **User controls URIs**: any fsspec-compatible URI +- Returns `ObjectRef` for lazy access via fsspec +- Stores optional checksum for verification +- No integrity guarantees (external resources may change/disappear) ```python class RawData(dj.Manual): definition = """ session_id : int --- - recording : filepath@raw # user specifies path + recording : filepath # external reference """ -# Insert - user provides relative path +# Insert - user provides URI (various protocols) table.insert1({ 'session_id': 1, - 'recording': 'experiment_001/session_001/data.nwb' + 'recording': 's3://my-bucket/experiment_001/data.nwb' +}) +# Or URL +table.insert1({ + 'session_id': 2, + 'recording': 'https://example.com/public/dataset.h5' +}) +# Or OAS store reference +table.insert1({ + 'session_id': 3, + 'recording': 'store://main/custom/path/file.zarr' }) -# Fetch - returns ObjectRef (lazy, no copy) +# Fetch - returns ObjectRef (lazy) row = (table & 'session_id=1').fetch1() ref = row['recording'] # ObjectRef ref.download('/local/path') # explicit download @@ -142,55 +156,82 @@ ref.open() # fsspec streaming access ```python # Core type behavior class FilepathType: - """Core user-addressed storage type.""" + """Core external reference type.""" - def store(self, user_path: str, store_backend) -> dict: + def store(self, uri: str, compute_checksum: bool = False) -> dict: """ - Register filepath, return metadata. - File must already exist at _files/{user_path} in store. + Register external reference, return metadata. + Optionally compute checksum for verification. """ - full_path = f"_files/{user_path}" - if not store_backend.exists(full_path): - raise FileNotFoundError(f"File not found: {full_path}") + metadata = {'uri': uri} - # Compute checksum for verification - checksum = store_backend.checksum(full_path) - size = store_backend.size(full_path) + if compute_checksum: + # Use fsspec to access and compute checksum + fs, path = fsspec.core.url_to_fs(uri) + if fs.exists(path): + metadata['checksum'] = compute_file_checksum(fs, path) + metadata['size'] = fs.size(path) - return { - 'path': user_path, - 'checksum': checksum, - 'size': size - } + return metadata - def retrieve(self, metadata: dict, store_backend) -> ObjectRef: + def retrieve(self, metadata: dict) -> ObjectRef: """Return ObjectRef for lazy access.""" return ObjectRef( - path=f"_files/{metadata['path']}", - store=store_backend, - checksum=metadata.get('checksum') # for verification + uri=metadata['uri'], + checksum=metadata.get('checksum') # optional verification ) ``` #### Database Column -The `filepath` type stores JSON metadata: +The `filepath` type uses the `json` core type: ```sql --- filepath column +-- filepath column (MySQL) recording JSON NOT NULL --- Contains: {"path": "...", "checksum": "...", "size": ...} +-- Contains: {"uri": "s3://...", "checksum": "...", "size": ...} + +-- filepath column (PostgreSQL) +recording JSONB NOT NULL ``` +#### Supported URI Schemes + +| Scheme | Example | Backend | +|--------|---------|---------| +| `s3://` | `s3://bucket/key/file.nwb` | S3 via fsspec | +| `gs://` | `gs://bucket/object` | Google Cloud Storage | +| `https://` | `https://example.com/data.h5` | HTTP(S) | +| `file://` | `file:///local/path/data.csv` | Local filesystem | +| `store://` | `store://main/path/file.zarr` | OAS store | + #### Key Differences from Legacy `filepath@store` | Feature | Legacy | New | |---------|--------|-----| +| Location | OAS store only | Any URI (S3, HTTP, etc.) 
| | Access | Copy to local stage | ObjectRef (lazy) | | Copying | Automatic | Explicit via `ref.download()` | | Streaming | No | Yes via `ref.open()` | -| Folders | No | Yes | -| Interface | Returns local path | Returns ObjectRef | +| Integrity | Managed by DataJoint | External (may change) | +| Store param | Required (`@store`) | Optional (embedded in URI) | + +### `json` - Cross-Database JSON Type + +**New core type.** JSON storage compatible across MySQL and PostgreSQL: + +```sql +-- MySQL +column_name JSON NOT NULL + +-- PostgreSQL +column_name JSONB NOT NULL +``` + +The `json` core type: +- Stores arbitrary JSON-serializable data +- Automatically uses appropriate type for database backend +- Supports JSON path queries where available ## Parameterized AttributeTypes @@ -337,11 +378,12 @@ class Attachments(dj.Manual): │ │ ├───────────────────────────────────────────────────────────────────┤ │ Core DataJoint Types │ -│ longblob content object filepath │ -│ content@s object@s filepath@s │ +│ longblob content object filepath json │ +│ content@s object@s │ ├───────────────────────────────────────────────────────────────────┤ -│ MySQL Types │ -│ LONGBLOB CHAR(64) JSON JSON VARCHAR etc. │ +│ Database Types │ +│ LONGBLOB CHAR(64) JSON JSON/JSONB VARCHAR etc. │ +│ (MySQL) (PostgreSQL) │ └───────────────────────────────────────────────────────────────────┘ ``` @@ -357,7 +399,7 @@ class Attachments(dj.Manual): | `` | `content@s` | `_content/{hash}` | Yes | Local file path | | `object` | — | `{schema}/{table}/{pk}/` | No | ObjectRef | | `object@s` | — | `{schema}/{table}/{pk}/` | No | ObjectRef | -| `filepath@s` | — | `_files/{user-path}` | No | ObjectRef | +| `filepath` | `json` | External (any URI) | No | ObjectRef | ## Reference Counting for Content Type @@ -408,33 +450,35 @@ def garbage_collect(project): | Feature | `object` | `content` | `filepath` | |---------|----------|-----------|------------| -| Addressing | Primary key | Content hash | User-defined path | +| Location | OAS store | OAS store | Anywhere (URI) | +| Addressing | Primary key | Content hash | User URI | | Path control | DataJoint | DataJoint | User | | Deduplication | No | Yes | No | -| Structure | Files, folders, Zarr | Single blob only | Files, folders | +| Structure | Files, folders, Zarr | Single blob only | Any (via fsspec) | | Access | ObjectRef (lazy) | Transparent (bytes) | ObjectRef (lazy) | -| GC | Deleted with row | Reference counted | Deleted with row | -| Checksum | Optional | Implicit (is the hash) | Stored in DB | +| GC | Deleted with row | Reference counted | N/A (external) | +| Integrity | DataJoint managed | DataJoint managed | External (no guarantees) | **When to use each:** - **`object`**: Large/complex objects where DataJoint controls organization (Zarr, HDF5) - **`content`**: Deduplicated serialized data or file attachments via ``, `` -- **`filepath`**: User-managed file organization, external data sources +- **`filepath`**: External references (S3, URLs, etc.) not managed by DataJoint ## Key Design Decisions -1. **Layered architecture**: Core types (`object`, `content`, `filepath`) separate from AttributeTypes -2. **Three OAS regions**: object (PK-addressed), content (hash-addressed), filepath (user-addressed) -3. **Content type**: Single-blob, content-addressed, deduplicated storage -4. **Filepath upgrade**: Returns ObjectRef (lazy) instead of copying files -5. **Parameterized types**: `` passes parameter to underlying dtype -6. **Naming convention**: +1. 
**Layered architecture**: Core types (`object`, `content`, `filepath`, `json`) separate from AttributeTypes +2. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores +3. **Filepath as reference tracker**: Not an OAS region - tracks external URIs (S3, HTTP, etc.) +4. **Content type**: Single-blob, content-addressed, deduplicated storage +5. **JSON core type**: Cross-database compatible (MySQL JSON, PostgreSQL JSONB) +6. **Parameterized types**: `` passes parameter to underlying dtype +7. **Naming convention**: - `` = internal serialized (database) - `` = external serialized (content-addressed) - `` = internal file (single file) - `` = external file (single file) -7. **Transparent access**: AttributeTypes return Python objects or file paths -8. **Lazy access**: `object`, `object@store`, and `filepath@store` return ObjectRef +8. **Transparent access**: AttributeTypes return Python objects or file paths +9. **Lazy access**: `object`, `object@store`, and `filepath` return ObjectRef ## Migration from Legacy Types From 43c1999c6792600659bfd55b79501e0323fc7604 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 06:19:43 +0000 Subject: [PATCH 19/41] Simplify filepath to filepath@store with relative paths for portability - Remove general URI tracker concept from filepath - filepath@store now requires a store parameter and uses relative paths - Key benefit: portability across environments by changing store config - For arbitrary URLs, recommend using varchar (simpler, more transparent) - Add comparison table for filepath@store vs varchar use cases - Update all diagrams and tables to reflect the change Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 130 +++++++++---------- 1 file changed, 60 insertions(+), 70 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index b4b149628..f34d1b84a 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -17,11 +17,8 @@ This document defines a layered storage architecture: ### External References -`filepath` is **not** an OAS region - it's a general reference tracker for external resources: -- OAS store paths: `store://main/experiment/data.h5` -- URLs: `https://example.com/dataset.zip` -- S3: `s3://bucket/key/file.nwb` -- Any fsspec-compatible URI +`filepath@store` provides portable relative paths within configured stores with lazy ObjectRef access. +For arbitrary URLs that don't need ObjectRef semantics, use `varchar` instead. ## Core Types @@ -110,38 +107,31 @@ The `content` type stores a `char(64)` hash in the database: features CHAR(64) NOT NULL -- SHA256 hex hash ``` -### `filepath` - External Reference Tracker +### `filepath@store` - Portable External Reference -**Upgraded from legacy.** General-purpose reference tracker for external resources: +**Upgraded from legacy.** Relative path references within configured stores: -- **Not an OAS region**: references can point anywhere (URLs, S3, OAS stores, etc.) -- **User controls URIs**: any fsspec-compatible URI +- **Relative paths**: paths within a configured store (portable across environments) +- **Store-aware**: resolves paths against configured store backend - Returns `ObjectRef` for lazy access via fsspec - Stores optional checksum for verification -- No integrity guarantees (external resources may change/disappear) + +**Key benefit**: Portability. 
The path is relative to the store, so pipelines can be moved +between environments (dev → prod, cloud → local) by changing store configuration without +updating data. ```python class RawData(dj.Manual): definition = """ session_id : int --- - recording : filepath # external reference + recording : filepath@main # relative path within 'main' store """ -# Insert - user provides URI (various protocols) +# Insert - user provides relative path within the store table.insert1({ 'session_id': 1, - 'recording': 's3://my-bucket/experiment_001/data.nwb' -}) -# Or URL -table.insert1({ - 'session_id': 2, - 'recording': 'https://example.com/public/dataset.h5' -}) -# Or OAS store reference -table.insert1({ - 'session_id': 3, - 'recording': 'store://main/custom/path/file.zarr' + 'recording': 'experiment_001/data.nwb' # relative to main store root }) # Fetch - returns ObjectRef (lazy) @@ -151,33 +141,43 @@ ref.download('/local/path') # explicit download ref.open() # fsspec streaming access ``` +#### When to Use `filepath@store` vs `varchar` + +| Use Case | Recommended Type | +|----------|------------------| +| Need ObjectRef/lazy access | `filepath@store` | +| Need portability (relative paths) | `filepath@store` | +| Want checksum verification | `filepath@store` | +| Just storing a URL string | `varchar` | +| External URLs you don't control | `varchar` | + +For arbitrary URLs (S3, HTTP, etc.) where you don't need ObjectRef semantics, +just use `varchar`. A string is simpler and more transparent. + #### Filepath Type Behavior ```python # Core type behavior class FilepathType: - """Core external reference type.""" + """Core external reference type with store-relative paths.""" - def store(self, uri: str, compute_checksum: bool = False) -> dict: - """ - Register external reference, return metadata. - Optionally compute checksum for verification. 
- """ - metadata = {'uri': uri} + def store(self, relative_path: str, store_backend, compute_checksum: bool = False) -> dict: + """Register reference to file in store.""" + metadata = {'path': relative_path} if compute_checksum: - # Use fsspec to access and compute checksum - fs, path = fsspec.core.url_to_fs(uri) - if fs.exists(path): - metadata['checksum'] = compute_file_checksum(fs, path) - metadata['size'] = fs.size(path) + full_path = store_backend.resolve(relative_path) + if store_backend.exists(full_path): + metadata['checksum'] = compute_file_checksum(store_backend, full_path) + metadata['size'] = store_backend.size(full_path) return metadata - def retrieve(self, metadata: dict) -> ObjectRef: + def retrieve(self, metadata: dict, store_backend) -> ObjectRef: """Return ObjectRef for lazy access.""" return ObjectRef( - uri=metadata['uri'], + store=store_backend, + path=metadata['path'], checksum=metadata.get('checksum') # optional verification ) ``` @@ -189,32 +189,21 @@ The `filepath` type uses the `json` core type: ```sql -- filepath column (MySQL) recording JSON NOT NULL --- Contains: {"uri": "s3://...", "checksum": "...", "size": ...} +-- Contains: {"path": "experiment_001/data.nwb", "checksum": "...", "size": ...} -- filepath column (PostgreSQL) recording JSONB NOT NULL ``` -#### Supported URI Schemes - -| Scheme | Example | Backend | -|--------|---------|---------| -| `s3://` | `s3://bucket/key/file.nwb` | S3 via fsspec | -| `gs://` | `gs://bucket/object` | Google Cloud Storage | -| `https://` | `https://example.com/data.h5` | HTTP(S) | -| `file://` | `file:///local/path/data.csv` | Local filesystem | -| `store://` | `store://main/path/file.zarr` | OAS store | - #### Key Differences from Legacy `filepath@store` | Feature | Legacy | New | |---------|--------|-----| -| Location | OAS store only | Any URI (S3, HTTP, etc.) 
| | Access | Copy to local stage | ObjectRef (lazy) | | Copying | Automatic | Explicit via `ref.download()` | | Streaming | No | Yes via `ref.open()` | -| Integrity | Managed by DataJoint | External (may change) | -| Store param | Required (`@store`) | Optional (embedded in URI) | +| Paths | Relative | Relative (unchanged) | +| Store param | Required (`@store`) | Required (`@store`) | ### `json` - Cross-Database JSON Type @@ -378,7 +367,7 @@ class Attachments(dj.Manual): │ │ ├───────────────────────────────────────────────────────────────────┤ │ Core DataJoint Types │ -│ longblob content object filepath json │ +│ longblob content object filepath@s json │ │ content@s object@s │ ├───────────────────────────────────────────────────────────────────┤ │ Database Types │ @@ -399,7 +388,7 @@ class Attachments(dj.Manual): | `` | `content@s` | `_content/{hash}` | Yes | Local file path | | `object` | — | `{schema}/{table}/{pk}/` | No | ObjectRef | | `object@s` | — | `{schema}/{table}/{pk}/` | No | ObjectRef | -| `filepath` | `json` | External (any URI) | No | ObjectRef | +| `filepath@s` | `json` | Configured store (relative path) | No | ObjectRef | ## Reference Counting for Content Type @@ -448,37 +437,39 @@ def garbage_collect(project): ## Core Type Comparison -| Feature | `object` | `content` | `filepath` | -|---------|----------|-----------|------------| -| Location | OAS store | OAS store | Anywhere (URI) | -| Addressing | Primary key | Content hash | User URI | +| Feature | `object` | `content` | `filepath@store` | +|---------|----------|-----------|------------------| +| Location | OAS store | OAS store | Configured store | +| Addressing | Primary key | Content hash | Relative path | | Path control | DataJoint | DataJoint | User | | Deduplication | No | Yes | No | | Structure | Files, folders, Zarr | Single blob only | Any (via fsspec) | | Access | ObjectRef (lazy) | Transparent (bytes) | ObjectRef (lazy) | -| GC | Deleted with row | Reference counted | N/A (external) | -| Integrity | DataJoint managed | DataJoint managed | External (no guarantees) | +| GC | Deleted with row | Reference counted | N/A (user managed) | +| Integrity | DataJoint managed | DataJoint managed | User managed | **When to use each:** - **`object`**: Large/complex objects where DataJoint controls organization (Zarr, HDF5) - **`content`**: Deduplicated serialized data or file attachments via ``, `` -- **`filepath`**: External references (S3, URLs, etc.) not managed by DataJoint +- **`filepath@store`**: Portable references to files in configured stores +- **`varchar`**: Arbitrary URLs/paths where ObjectRef semantics aren't needed ## Key Design Decisions -1. **Layered architecture**: Core types (`object`, `content`, `filepath`, `json`) separate from AttributeTypes +1. **Layered architecture**: Core types (`object`, `content`, `filepath@store`, `json`) separate from AttributeTypes 2. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores -3. **Filepath as reference tracker**: Not an OAS region - tracks external URIs (S3, HTTP, etc.) -4. **Content type**: Single-blob, content-addressed, deduplicated storage -5. **JSON core type**: Cross-database compatible (MySQL JSON, PostgreSQL JSONB) -6. **Parameterized types**: `` passes parameter to underlying dtype -7. **Naming convention**: +3. **Filepath for portability**: `filepath@store` uses relative paths within stores for environment portability +4. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent +5. 
**Content type**: Single-blob, content-addressed, deduplicated storage +6. **JSON core type**: Cross-database compatible (MySQL JSON, PostgreSQL JSONB) +7. **Parameterized types**: `` passes parameter to underlying dtype +8. **Naming convention**: - `` = internal serialized (database) - `` = external serialized (content-addressed) - `` = internal file (single file) - `` = external file (single file) -8. **Transparent access**: AttributeTypes return Python objects or file paths -9. **Lazy access**: `object`, `object@store`, and `filepath` return ObjectRef +9. **Transparent access**: AttributeTypes return Python objects or file paths +10. **Lazy access**: `object`, `object@store`, and `filepath@store` return ObjectRef ## Migration from Legacy Types @@ -550,5 +541,4 @@ def migrate_external_store(schema, store_name): 1. Should `content` without `@store` use a default store, or require explicit store? 2. Should we support `` without `@store` syntax (implying default store)? -3. Should `filepath` without `@store` be supported (using default store)? -4. How long should the backward compatibility layer support legacy `~external_*` format? +3. How long should the backward compatibility layer support legacy `~external_*` format? From b9b6e34f9196b9c082fb6df37e4058befddd02d1 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 06:25:46 +0000 Subject: [PATCH 20/41] Simplify to two-layer architecture: database types + AttributeTypes - Remove "core types" concept - all storage types are now AttributeTypes - Built-in AttributeTypes (object, content, filepath@store) use json dtype - JSON stores metadata: path, hash, store name, size, etc. - User-defined AttributeTypes can compose built-in ones (e.g., uses content) - Clearer separation: database types (json, longblob) vs AttributeTypes (encode/decode) Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 226 +++++++++++-------- 1 file changed, 137 insertions(+), 89 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index f34d1b84a..32083a88e 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -2,11 +2,14 @@ ## Overview -This document defines a layered storage architecture: +This document defines a two-layer storage architecture: -1. **Database types**: `longblob`, `varchar`, `int`, `json`, etc. -2. **Core DataJoint types**: `object`, `content`, `filepath`, `json` (and `@store` variants where applicable) -3. **AttributeTypes**: ``, ``, ``, etc. (built on top of core types) +1. **Database types**: `longblob`, `varchar`, `int`, `json`, etc. (MySQL/PostgreSQL native) +2. **AttributeTypes**: Custom types with `encode()`/`decode()` semantics + +All DataJoint storage types (`object`, `content`, `filepath@store`, ``, etc.) are +implemented as **AttributeTypes**. Some are built-in (auto-registered, use `dj.config` for stores) +while others are user-defined. ### OAS Storage Regions @@ -20,17 +23,21 @@ This document defines a layered storage architecture: `filepath@store` provides portable relative paths within configured stores with lazy ObjectRef access. For arbitrary URLs that don't need ObjectRef semantics, use `varchar` instead. -## Core Types +## Built-in AttributeTypes + +Built-in types are auto-registered and use `dj.config['stores']` for store configuration. +They use `json` as their database dtype to store metadata. 
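The built-in types lean on `dj.config['stores']`, which this spec does not show. The sketch below is one plausible configuration, assuming the existing stores format carries over; the `default` entry and the key names (`protocol`, `location`, `endpoint`, `bucket`) are assumptions, mirroring the `encode()` pseudocode in the sections that follow.

```python
import datajoint as dj

# Hypothetical store configuration (key names assumed, not fixed by this spec).
dj.config['stores'] = {
    'default': 'main',   # read as dj.config['stores']['default'] in the encode() sketches
    'main': {            # local filesystem store
        'protocol': 'file',
        'location': '/data/dj-store',
    },
    'cold': {            # S3-backed store, addressed as object@cold, content@cold, etc.
        'protocol': 's3',
        'endpoint': 's3.amazonaws.com',
        'bucket': 'lab-archive',
        'location': 'cold-store',
    },
}
```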
### `object` / `object@store` - Path-Addressed Storage -**Already implemented.** OAS (Object-Augmented Schema) storage: +**Built-in AttributeType.** OAS (Object-Augmented Schema) storage: - Path derived from primary key: `{schema}/{table}/{pk}/{attribute}/` - One-to-one relationship with table row - Deleted when row is deleted - Returns `ObjectRef` for lazy access - Supports direct writes (Zarr, HDF5) via fsspec +- **dtype**: `json` (stores path, store name, metadata) ```python class Analysis(dj.Computed): @@ -42,9 +49,34 @@ class Analysis(dj.Computed): """ ``` +#### Implementation + +```python +class ObjectType(AttributeType): + """Built-in AttributeType for path-addressed OAS storage.""" + type_name = "object" + dtype = "json" + + def encode(self, value, *, key=None, store_name=None) -> dict: + store = get_store(store_name or dj.config['stores']['default']) + path = self._compute_path(key) # {schema}/{table}/{pk}/{attr}/ + store.put(path, value) + return { + "path": path, + "store": store_name, + # Additional metadata (size, timestamps, etc.) + } + + def decode(self, stored: dict, *, key=None) -> ObjectRef: + return ObjectRef( + store=get_store(stored["store"]), + path=stored["path"] + ) +``` + ### `content` / `content@store` - Content-Addressed Storage -**New core type.** Content-addressed storage with deduplication: +**Built-in AttributeType.** Content-addressed storage with deduplication: - **Single blob only**: stores a single file or serialized object (not folders) - **Per-project scope**: content is shared across all schemas in a project (not per-schema) @@ -53,6 +85,7 @@ class Analysis(dj.Computed): - Reference counted for garbage collection - Deduplication: identical content stored once across the entire project - For folders/complex objects, use `object` type instead +- **dtype**: `json` (stores hash, store name, size, metadata) ``` store_root/ @@ -63,58 +96,63 @@ store_root/ └── {hash[:2]}/{hash[2:4]}/{hash} ``` -#### Content Type Behavior - -The `content` core type: -- Accepts `bytes` on insert -- Computes SHA256 hash of the content -- Stores in `_content/{hash}/` if not already present (deduplication) -- Returns `bytes` on fetch (transparent retrieval) -- Registers in `ContentRegistry` for GC tracking +#### Implementation ```python -# Core type behavior (built-in, not an AttributeType) -class ContentType: - """Core content-addressed storage type.""" +class ContentType(AttributeType): + """Built-in AttributeType for content-addressed storage.""" + type_name = "content" + dtype = "json" - def store(self, data: bytes, store_backend) -> str: - """Store content, return hash.""" + def encode(self, data: bytes, *, key=None, store_name=None) -> dict: + """Store content, return metadata as JSON.""" content_hash = hashlib.sha256(data).hexdigest() + store = get_store(store_name or dj.config['stores']['default']) path = f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" - if not store_backend.exists(path): - store_backend.put(path, data) + if not store.exists(path): + store.put(path, data) ContentRegistry().insert1({ 'content_hash': content_hash, - 'store': store_backend.name, + 'store': store_name, 'size': len(data) - }) + }, skip_duplicates=True) - return content_hash + return { + "hash": content_hash, + "store": store_name, + "size": len(data) + } - def retrieve(self, content_hash: str, store_backend) -> bytes: + def decode(self, stored: dict, *, key=None) -> bytes: """Retrieve content by hash.""" - path = 
f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" - return store_backend.get(path) + store = get_store(stored["store"]) + path = f"_content/{stored['hash'][:2]}/{stored['hash'][2:4]}/{stored['hash']}" + return store.get(path) ``` #### Database Column -The `content` type stores a `char(64)` hash in the database: +The `content` type stores JSON metadata: ```sql --- content column -features CHAR(64) NOT NULL -- SHA256 hex hash +-- content column (MySQL) +features JSON NOT NULL +-- Contains: {"hash": "abc123...", "store": "main", "size": 12345} + +-- content column (PostgreSQL) +features JSONB NOT NULL ``` ### `filepath@store` - Portable External Reference -**Upgraded from legacy.** Relative path references within configured stores: +**Built-in AttributeType.** Relative path references within configured stores: - **Relative paths**: paths within a configured store (portable across environments) - **Store-aware**: resolves paths against configured store backend - Returns `ObjectRef` for lazy access via fsspec - Stores optional checksum for verification +- **dtype**: `json` (stores path, store name, checksum, metadata) **Key benefit**: Portability. The path is relative to the store, so pipelines can be moved between environments (dev → prod, cloud → local) by changing store configuration without @@ -154,42 +192,43 @@ ref.open() # fsspec streaming access For arbitrary URLs (S3, HTTP, etc.) where you don't need ObjectRef semantics, just use `varchar`. A string is simpler and more transparent. -#### Filepath Type Behavior +#### Implementation ```python -# Core type behavior -class FilepathType: - """Core external reference type with store-relative paths.""" +class FilepathType(AttributeType): + """Built-in AttributeType for store-relative file references.""" + type_name = "filepath" + dtype = "json" - def store(self, relative_path: str, store_backend, compute_checksum: bool = False) -> dict: + def encode(self, relative_path: str, *, key=None, store_name=None, + compute_checksum: bool = False) -> dict: """Register reference to file in store.""" - metadata = {'path': relative_path} + store = get_store(store_name) # store_name required for filepath + metadata = {'path': relative_path, 'store': store_name} if compute_checksum: - full_path = store_backend.resolve(relative_path) - if store_backend.exists(full_path): - metadata['checksum'] = compute_file_checksum(store_backend, full_path) - metadata['size'] = store_backend.size(full_path) + full_path = store.resolve(relative_path) + if store.exists(full_path): + metadata['checksum'] = compute_file_checksum(store, full_path) + metadata['size'] = store.size(full_path) return metadata - def retrieve(self, metadata: dict, store_backend) -> ObjectRef: + def decode(self, stored: dict, *, key=None) -> ObjectRef: """Return ObjectRef for lazy access.""" return ObjectRef( - store=store_backend, - path=metadata['path'], - checksum=metadata.get('checksum') # optional verification + store=get_store(stored['store']), + path=stored['path'], + checksum=stored.get('checksum') # optional verification ) ``` #### Database Column -The `filepath` type uses the `json` core type: - ```sql -- filepath column (MySQL) recording JSON NOT NULL --- Contains: {"path": "experiment_001/data.nwb", "checksum": "...", "size": ...} +-- Contains: {"path": "experiment_001/data.nwb", "store": "main", "checksum": "...", "size": ...} -- filepath column (PostgreSQL) recording JSONB NOT NULL @@ -205,49 +244,52 @@ recording JSONB NOT NULL | Paths | Relative | Relative (unchanged) | | 
Store param | Required (`@store`) | Required (`@store`) | +## Database Types + ### `json` - Cross-Database JSON Type -**New core type.** JSON storage compatible across MySQL and PostgreSQL: +JSON storage compatible across MySQL and PostgreSQL: ```sql -- MySQL column_name JSON NOT NULL --- PostgreSQL +-- PostgreSQL (uses JSONB for better indexing) column_name JSONB NOT NULL ``` -The `json` core type: +The `json` database type: +- Used as dtype by built-in AttributeTypes (`object`, `content`, `filepath@store`) - Stores arbitrary JSON-serializable data - Automatically uses appropriate type for database backend - Supports JSON path queries where available ## Parameterized AttributeTypes -AttributeTypes can be parameterized with `` syntax. The parameter is passed -through to the underlying dtype: +AttributeTypes can be parameterized with `` syntax. The parameter specifies +which store to use: ```python class AttributeType: - type_name: str # Name used in - dtype: str # Base underlying type + type_name: str # Name used in or as bare type + dtype: str # Database type or built-in AttributeType - # When user writes , resolved dtype becomes: - # f"{dtype}@{param}" if param specified, else dtype + # When user writes type_name@param, resolved store becomes param ``` **Resolution examples:** ``` - → dtype = "content" → default store - → dtype = "content@cold" → cold store - → dtype = "longblob" → database - → ERROR: longblob doesn't support parameters + → uses content type → default store + → uses content type → cold store + → dtype = "longblob" → database (no store) +object@cold → uses object type → cold store ``` -This means `` and `` share the same AttributeType class - the -parameter flows through to the core type, which validates whether it supports `@store`. +AttributeTypes can use other AttributeTypes as their dtype (composition): +- `` uses `content` - adds djblob serialization on top of content-addressed storage +- `` uses `content` - adds filename preservation on top of content-addressed storage -## AttributeTypes (Built on Core Types) +## User-Defined AttributeTypes ### `` - Internal Serialized Blob @@ -364,31 +406,35 @@ class Attachments(dj.Manual): ``` ┌───────────────────────────────────────────────────────────────────┐ │ AttributeTypes │ -│ │ +│ │ +│ Built-in: object content filepath@s │ +│ User: │ ├───────────────────────────────────────────────────────────────────┤ -│ Core DataJoint Types │ -│ longblob content object filepath@s json │ -│ content@s object@s │ -├───────────────────────────────────────────────────────────────────┤ -│ Database Types │ -│ LONGBLOB CHAR(64) JSON JSON/JSONB VARCHAR etc. │ -│ (MySQL) (PostgreSQL) │ +│ Database Types (dtype) │ +│ │ +│ LONGBLOB JSON/JSONB VARCHAR INT etc. 
│ └───────────────────────────────────────────────────────────────────┘ ``` +All storage types are AttributeTypes: +- **Built-in**: `object`, `content`, `filepath@store` - auto-registered, use `dj.config` +- **User-defined**: ``, ``, ``, ``, `` - registered via `@dj.register_type` + ## Storage Comparison -| Type | Core Type | Storage Location | Dedup | Returns | -|------|-----------|------------------|-------|---------| +| Type | dtype | Storage Location | Dedup | Returns | +|------|-------|------------------|-------|---------| +| `object` | `json` | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `object@s` | `json` | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `content` | `json` | `_content/{hash}` | Yes | bytes | +| `content@s` | `json` | `_content/{hash}` | Yes | bytes | +| `filepath@s` | `json` | Configured store (relative path) | No | ObjectRef | | `` | `longblob` | Database | No | Python object | | `` | `content` | `_content/{hash}` | Yes | Python object | | `` | `content@s` | `_content/{hash}` | Yes | Python object | | `` | `longblob` | Database | No | Local file path | | `` | `content` | `_content/{hash}` | Yes | Local file path | | `` | `content@s` | `_content/{hash}` | Yes | Local file path | -| `object` | — | `{schema}/{table}/{pk}/` | No | ObjectRef | -| `object@s` | — | `{schema}/{table}/{pk}/` | No | ObjectRef | -| `filepath@s` | `json` | Configured store (relative path) | No | ObjectRef | ## Reference Counting for Content Type @@ -435,10 +481,11 @@ def garbage_collect(project): (ContentRegistry() & {'content_hash': content_hash}).delete() ``` -## Core Type Comparison +## Built-in AttributeType Comparison | Feature | `object` | `content` | `filepath@store` | |---------|----------|-----------|------------------| +| dtype | `json` | `json` | `json` | | Location | OAS store | OAS store | Configured store | | Addressing | Primary key | Content hash | Relative path | | Path control | DataJoint | DataJoint | User | @@ -456,20 +503,21 @@ def garbage_collect(project): ## Key Design Decisions -1. **Layered architecture**: Core types (`object`, `content`, `filepath@store`, `json`) separate from AttributeTypes -2. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores -3. **Filepath for portability**: `filepath@store` uses relative paths within stores for environment portability -4. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent -5. **Content type**: Single-blob, content-addressed, deduplicated storage -6. **JSON core type**: Cross-database compatible (MySQL JSON, PostgreSQL JSONB) -7. **Parameterized types**: `` passes parameter to underlying dtype -8. **Naming convention**: +1. **Two-layer architecture**: Database types (`json`, `longblob`, etc.) and AttributeTypes +2. **All storage types are AttributeTypes**: Built-in (`object`, `content`, `filepath@store`) and user-defined (``, etc.) +3. **Built-in types use JSON dtype**: Stores metadata (path, hash, store name, etc.) in JSON columns +4. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores +5. **Filepath for portability**: `filepath@store` uses relative paths within stores for environment portability +6. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent +7. **Content type**: Single-blob, content-addressed, deduplicated storage +8. **Parameterized types**: `` passes parameter to underlying dtype +9. 
**Naming convention**: - `` = internal serialized (database) - `` = external serialized (content-addressed) - `` = internal file (single file) - `` = external file (single file) -9. **Transparent access**: AttributeTypes return Python objects or file paths -10. **Lazy access**: `object`, `object@store`, and `filepath@store` return ObjectRef +10. **Transparent access**: AttributeTypes return Python objects or file paths +11. **Lazy access**: `object`, `object@store`, and `filepath@store` return ObjectRef ## Migration from Legacy Types From 2a5d161fe8dccfa5475d89563bcc877c1183ccb1 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 06:32:42 +0000 Subject: [PATCH 21/41] Add three-layer type architecture with core DataJoint types Layer 1: Native database types (FLOAT, TINYINT, etc.) - backend-specific, discouraged Layer 2: Core DataJoint types (float32, uint8, bool, json) - standardized, scientist-friendly Layer 3: AttributeTypes (object, content, , etc.) - encode/decode, composable Core types provide: - Consistent interface across MySQL and PostgreSQL - Scientist-friendly names (float32 vs FLOAT, uint8 vs TINYINT UNSIGNED) - Automatic backend translation Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 146 +++++++++++++------ 1 file changed, 103 insertions(+), 43 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 32083a88e..0d4223a96 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -2,14 +2,31 @@ ## Overview -This document defines a two-layer storage architecture: +This document defines a three-layer type architecture: -1. **Database types**: `longblob`, `varchar`, `int`, `json`, etc. (MySQL/PostgreSQL native) -2. **AttributeTypes**: Custom types with `encode()`/`decode()` semantics +1. **Native database types** - Backend-specific (`FLOAT`, `TINYINT UNSIGNED`, `LONGBLOB`). Discouraged for direct use. +2. **Core DataJoint types** - Standardized across backends, scientist-friendly (`float32`, `uint8`, `bool`, `json`). +3. **AttributeTypes** - Programmatic types with `encode()`/`decode()` semantics. Composable. -All DataJoint storage types (`object`, `content`, `filepath@store`, ``, etc.) are -implemented as **AttributeTypes**. Some are built-in (auto-registered, use `dj.config` for stores) -while others are user-defined. +``` +┌───────────────────────────────────────────────────────────────────┐ +│ AttributeTypes (Layer 3) │ +│ │ +│ Built-in: object content filepath@s │ +│ User: ... │ +├───────────────────────────────────────────────────────────────────┤ +│ Core DataJoint Types (Layer 2) │ +│ │ +│ int8 int16 int32 int64 float32 float64 bool decimal │ +│ uint8 uint16 uint32 uint64 varchar char uuid date │ +│ json longblob blob timestamp datetime enum │ +├───────────────────────────────────────────────────────────────────┤ +│ Native Database Types (Layer 1) │ +│ │ +│ MySQL: TINYINT SMALLINT INT BIGINT FLOAT DOUBLE ... │ +│ PostgreSQL: SMALLINT INTEGER BIGINT REAL DOUBLE PRECISION │ +└───────────────────────────────────────────────────────────────────┘ +``` ### OAS Storage Regions @@ -23,10 +40,68 @@ while others are user-defined. `filepath@store` provides portable relative paths within configured stores with lazy ObjectRef access. For arbitrary URLs that don't need ObjectRef semantics, use `varchar` instead. 
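To make the distinction concrete, a single table can hold both kinds of reference. The definition below is an illustrative sketch (the schema and attribute names are not from this spec), using the core-type and `filepath@store` syntax defined in this revision:

```python
import datajoint as dj

schema = dj.Schema('demo_sessions')   # hypothetical schema name

@schema
class Session(dj.Manual):
    definition = """
    session_id : int32              # core DataJoint type (Layer 2)
    ---
    raw_file   : filepath@main      # store-relative path, fetched lazily as an ObjectRef
    source_url : varchar(255)       # arbitrary external URL kept as a plain string
    """
```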
-## Built-in AttributeTypes +## Core DataJoint Types (Layer 2) + +Core types provide a standardized, scientist-friendly interface that works identically across +MySQL and PostgreSQL backends. Users should prefer these over native database types. + +### Numeric Types + +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `int8` | 8-bit signed | `TINYINT` | `SMALLINT` (clamped) | +| `int16` | 16-bit signed | `SMALLINT` | `SMALLINT` | +| `int32` | 32-bit signed | `INT` | `INTEGER` | +| `int64` | 64-bit signed | `BIGINT` | `BIGINT` | +| `uint8` | 8-bit unsigned | `TINYINT UNSIGNED` | `SMALLINT` (checked) | +| `uint16` | 16-bit unsigned | `SMALLINT UNSIGNED` | `INTEGER` (checked) | +| `uint32` | 32-bit unsigned | `INT UNSIGNED` | `BIGINT` (checked) | +| `uint64` | 64-bit unsigned | `BIGINT UNSIGNED` | `NUMERIC(20)` | +| `float32` | 32-bit float | `FLOAT` | `REAL` | +| `float64` | 64-bit float | `DOUBLE` | `DOUBLE PRECISION` | +| `decimal(p,s)` | Fixed precision | `DECIMAL(p,s)` | `NUMERIC(p,s)` | + +### String Types + +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `char(n)` | Fixed-length | `CHAR(n)` | `CHAR(n)` | +| `varchar(n)` | Variable-length | `VARCHAR(n)` | `VARCHAR(n)` | + +### Boolean + +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `bool` | True/False | `TINYINT(1)` | `BOOLEAN` | + +### Date/Time Types -Built-in types are auto-registered and use `dj.config['stores']` for store configuration. -They use `json` as their database dtype to store metadata. +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `date` | Date only | `DATE` | `DATE` | +| `datetime` | Date and time | `DATETIME(6)` | `TIMESTAMP` | +| `timestamp` | Auto-updating | `TIMESTAMP` | `TIMESTAMP` | +| `time` | Time only | `TIME` | `TIME` | + +### Binary Types + +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `blob` | Binary up to 64KB | `BLOB` | `BYTEA` | +| `longblob` | Binary up to 4GB | `LONGBLOB` | `BYTEA` | + +### Special Types + +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `json` | JSON document | `JSON` | `JSONB` | +| `uuid` | UUID | `CHAR(36)` | `UUID` | +| `enum(...)` | Enumeration | `ENUM(...)` | `VARCHAR` + CHECK | + +## AttributeTypes (Layer 3) + +AttributeTypes provide `encode()`/`decode()` semantics on top of core types. They are +composable and can be built-in or user-defined. ### `object` / `object@store` - Path-Addressed Storage @@ -401,25 +476,6 @@ class Attachments(dj.Manual): """ ``` -## Type Layering Summary - -``` -┌───────────────────────────────────────────────────────────────────┐ -│ AttributeTypes │ -│ │ -│ Built-in: object content filepath@s │ -│ User: │ -├───────────────────────────────────────────────────────────────────┤ -│ Database Types (dtype) │ -│ │ -│ LONGBLOB JSON/JSONB VARCHAR INT etc. │ -└───────────────────────────────────────────────────────────────────┘ -``` - -All storage types are AttributeTypes: -- **Built-in**: `object`, `content`, `filepath@store` - auto-registered, use `dj.config` -- **User-defined**: ``, ``, ``, ``, `` - registered via `@dj.register_type` - ## Storage Comparison | Type | dtype | Storage Location | Dedup | Returns | @@ -503,21 +559,25 @@ def garbage_collect(project): ## Key Design Decisions -1. 
**Two-layer architecture**: Database types (`json`, `longblob`, etc.) and AttributeTypes -2. **All storage types are AttributeTypes**: Built-in (`object`, `content`, `filepath@store`) and user-defined (``, etc.) -3. **Built-in types use JSON dtype**: Stores metadata (path, hash, store name, etc.) in JSON columns -4. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores -5. **Filepath for portability**: `filepath@store` uses relative paths within stores for environment portability -6. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent -7. **Content type**: Single-blob, content-addressed, deduplicated storage -8. **Parameterized types**: `` passes parameter to underlying dtype -9. **Naming convention**: - - `` = internal serialized (database) - - `` = external serialized (content-addressed) - - `` = internal file (single file) - - `` = external file (single file) -10. **Transparent access**: AttributeTypes return Python objects or file paths -11. **Lazy access**: `object`, `object@store`, and `filepath@store` return ObjectRef +1. **Three-layer architecture**: + - Layer 1: Native database types (backend-specific, discouraged) + - Layer 2: Core DataJoint types (standardized, scientist-friendly) + - Layer 3: AttributeTypes (encode/decode, composable) +2. **Core types are scientist-friendly**: `float32`, `uint8`, `bool` instead of `FLOAT`, `TINYINT UNSIGNED`, `TINYINT(1)` +3. **AttributeTypes are composable**: `` uses `content`, which uses `json` +4. **Built-in AttributeTypes use JSON dtype**: Stores metadata (path, hash, store name, etc.) +5. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores +6. **Filepath for portability**: `filepath@store` uses relative paths within stores for environment portability +7. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent +8. **Content type**: Single-blob, content-addressed, deduplicated storage +9. **Parameterized types**: `type@param` passes store parameter +10. **Naming convention**: + - `` = internal serialized (database) + - `` = external serialized (content-addressed) + - `` = internal file (single file) + - `` = external file (single file) +11. **Transparent access**: AttributeTypes return Python objects or file paths +12. 
**Lazy access**: `object`, `object@store`, and `filepath@store` return ObjectRef ## Migration from Legacy Types From d36739dac2a1e8e95dabbe4420c7c5bd332200ed Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 06:37:03 +0000 Subject: [PATCH 22/41] Use angle brackets for all AttributeTypes in definitions All AttributeTypes (Layer 3) now use angle bracket syntax in table definitions: - Core types (Layer 2): int32, float64, varchar(255) - no brackets - AttributeTypes (Layer 3): , , - angle brackets This clear visual distinction helps users immediately identify: - Core types: direct database mapping - AttributeTypes: encode/decode transformation Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 99 ++++++++++---------- 1 file changed, 52 insertions(+), 47 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 0d4223a96..3d70c908e 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -12,8 +12,8 @@ This document defines a three-layer type architecture: ┌───────────────────────────────────────────────────────────────────┐ │ AttributeTypes (Layer 3) │ │ │ -│ Built-in: object content filepath@s │ -│ User: ... │ +│ Built-in: │ +│ User: ... │ ├───────────────────────────────────────────────────────────────────┤ │ Core DataJoint Types (Layer 2) │ │ │ @@ -28,6 +28,10 @@ This document defines a three-layer type architecture: └───────────────────────────────────────────────────────────────────┘ ``` +**Syntax distinction:** +- Core types: `int32`, `float64`, `varchar(255)` - no brackets +- AttributeTypes: ``, ``, `` - angle brackets + ### OAS Storage Regions | Region | Path Pattern | Addressing | Use Case | @@ -37,7 +41,7 @@ This document defines a three-layer type architecture: ### External References -`filepath@store` provides portable relative paths within configured stores with lazy ObjectRef access. +`` provides portable relative paths within configured stores with lazy ObjectRef access. For arbitrary URLs that don't need ObjectRef semantics, use `varchar` instead. ## Core DataJoint Types (Layer 2) @@ -103,7 +107,7 @@ MySQL and PostgreSQL backends. Users should prefer these over native database ty AttributeTypes provide `encode()`/`decode()` semantics on top of core types. They are composable and can be built-in or user-defined. -### `object` / `object@store` - Path-Addressed Storage +### `` / `` - Path-Addressed Storage **Built-in AttributeType.** OAS (Object-Augmented Schema) storage: @@ -119,8 +123,8 @@ class Analysis(dj.Computed): definition = """ -> Recording --- - results : object # default store - archive : object@cold # specific store + results : # default store + archive : # specific store """ ``` @@ -149,7 +153,7 @@ class ObjectType(AttributeType): ) ``` -### `content` / `content@store` - Content-Addressed Storage +### `` / `` - Content-Addressed Storage **Built-in AttributeType.** Content-addressed storage with deduplication: @@ -208,7 +212,7 @@ class ContentType(AttributeType): #### Database Column -The `content` type stores JSON metadata: +The `` type stores JSON metadata: ```sql -- content column (MySQL) @@ -219,7 +223,7 @@ features JSON NOT NULL features JSONB NOT NULL ``` -### `filepath@store` - Portable External Reference +### `` - Portable External Reference **Built-in AttributeType.** Relative path references within configured stores: @@ -236,9 +240,9 @@ updating data. 
```python class RawData(dj.Manual): definition = """ - session_id : int + session_id : int32 --- - recording : filepath@main # relative path within 'main' store + recording : # relative path within 'main' store """ # Insert - user provides relative path within the store @@ -254,13 +258,13 @@ ref.download('/local/path') # explicit download ref.open() # fsspec streaming access ``` -#### When to Use `filepath@store` vs `varchar` +#### When to Use `` vs `varchar` | Use Case | Recommended Type | |----------|------------------| -| Need ObjectRef/lazy access | `filepath@store` | -| Need portability (relative paths) | `filepath@store` | -| Want checksum verification | `filepath@store` | +| Need ObjectRef/lazy access | `` | +| Need portability (relative paths) | `` | +| Want checksum verification | `` | | Just storing a URL string | `varchar` | | External URLs you don't control | `varchar` | @@ -309,7 +313,7 @@ recording JSON NOT NULL recording JSONB NOT NULL ``` -#### Key Differences from Legacy `filepath@store` +#### Key Differences from Legacy `filepath@store` (now ``) | Feature | Legacy | New | |---------|--------|-----| @@ -334,7 +338,7 @@ column_name JSONB NOT NULL ``` The `json` database type: -- Used as dtype by built-in AttributeTypes (`object`, `content`, `filepath@store`) +- Used as dtype by built-in AttributeTypes (``, ``, ``) - Stores arbitrary JSON-serializable data - Automatically uses appropriate type for database backend - Supports JSON path queries where available @@ -354,15 +358,15 @@ class AttributeType: **Resolution examples:** ``` - → uses content type → default store - → uses content type → cold store - → dtype = "longblob" → database (no store) -object@cold → uses object type → cold store + → uses type → default store + → uses type → cold store + → dtype = "longblob" → database (no store) + → uses type → cold store ``` AttributeTypes can use other AttributeTypes as their dtype (composition): -- `` uses `content` - adds djblob serialization on top of content-addressed storage -- `` uses `content` - adds filename preservation on top of content-addressed storage +- `` uses `` - adds djblob serialization on top of content-addressed storage +- `` uses `` - adds filename preservation on top of content-addressed storage ## User-Defined AttributeTypes @@ -480,17 +484,17 @@ class Attachments(dj.Manual): | Type | dtype | Storage Location | Dedup | Returns | |------|-------|------------------|-------|---------| -| `object` | `json` | `{schema}/{table}/{pk}/` | No | ObjectRef | -| `object@s` | `json` | `{schema}/{table}/{pk}/` | No | ObjectRef | -| `content` | `json` | `_content/{hash}` | Yes | bytes | -| `content@s` | `json` | `_content/{hash}` | Yes | bytes | -| `filepath@s` | `json` | Configured store (relative path) | No | ObjectRef | +| `` | `json` | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `` | `json` | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `` | `json` | `_content/{hash}` | Yes | bytes | +| `` | `json` | `_content/{hash}` | Yes | bytes | +| `` | `json` | Configured store (relative path) | No | ObjectRef | | `` | `longblob` | Database | No | Python object | -| `` | `content` | `_content/{hash}` | Yes | Python object | -| `` | `content@s` | `_content/{hash}` | Yes | Python object | +| `` | `` | `_content/{hash}` | Yes | Python object | +| `` | `` | `_content/{hash}` | Yes | Python object | | `` | `longblob` | Database | No | Local file path | -| `` | `content` | `_content/{hash}` | Yes | Local file path | -| `` | `content@s` | `_content/{hash}` | Yes | Local file path 
| +| `` | `` | `_content/{hash}` | Yes | Local file path | +| `` | `` | `_content/{hash}` | Yes | Local file path | ## Reference Counting for Content Type @@ -539,8 +543,8 @@ def garbage_collect(project): ## Built-in AttributeType Comparison -| Feature | `object` | `content` | `filepath@store` | -|---------|----------|-----------|------------------| +| Feature | `` | `` | `` | +|---------|------------|-------------|---------------------| | dtype | `json` | `json` | `json` | | Location | OAS store | OAS store | Configured store | | Addressing | Primary key | Content hash | Relative path | @@ -552,9 +556,9 @@ def garbage_collect(project): | Integrity | DataJoint managed | DataJoint managed | User managed | **When to use each:** -- **`object`**: Large/complex objects where DataJoint controls organization (Zarr, HDF5) -- **`content`**: Deduplicated serialized data or file attachments via ``, `` -- **`filepath@store`**: Portable references to files in configured stores +- **``**: Large/complex objects where DataJoint controls organization (Zarr, HDF5) +- **``**: Deduplicated serialized data or file attachments via ``, `` +- **``**: Portable references to files in configured stores - **`varchar`**: Arbitrary URLs/paths where ObjectRef semantics aren't needed ## Key Design Decisions @@ -564,20 +568,21 @@ def garbage_collect(project): - Layer 2: Core DataJoint types (standardized, scientist-friendly) - Layer 3: AttributeTypes (encode/decode, composable) 2. **Core types are scientist-friendly**: `float32`, `uint8`, `bool` instead of `FLOAT`, `TINYINT UNSIGNED`, `TINYINT(1)` -3. **AttributeTypes are composable**: `` uses `content`, which uses `json` -4. **Built-in AttributeTypes use JSON dtype**: Stores metadata (path, hash, store name, etc.) -5. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores -6. **Filepath for portability**: `filepath@store` uses relative paths within stores for environment portability -7. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent -8. **Content type**: Single-blob, content-addressed, deduplicated storage -9. **Parameterized types**: `type@param` passes store parameter -10. **Naming convention**: +3. **AttributeTypes use angle brackets**: ``, ``, `` - distinguishes from core types +4. **AttributeTypes are composable**: `` uses ``, which uses `json` +5. **Built-in AttributeTypes use JSON dtype**: Stores metadata (path, hash, store name, etc.) +6. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores +7. **Filepath for portability**: `` uses relative paths within stores for environment portability +8. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent +9. **Content type**: Single-blob, content-addressed, deduplicated storage +10. **Parameterized types**: `` passes store parameter +11. **Naming convention**: - `` = internal serialized (database) - `` = external serialized (content-addressed) - `` = internal file (single file) - `` = external file (single file) -11. **Transparent access**: AttributeTypes return Python objects or file paths -12. **Lazy access**: `object`, `object@store`, and `filepath@store` return ObjectRef +12. **Transparent access**: AttributeTypes return Python objects or file paths +13. 
**Lazy access**: ``, ``, and `` return ObjectRef ## Migration from Legacy Types From 5c1e854e64497a2d1b37b56c64e9402e82b755c8 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 06:46:28 +0000 Subject: [PATCH 23/41] Add implementation plan for storage types redesign Seven-phase implementation plan covering: - Phase 1: Core type system foundation (type mappings, store parameters) - Phase 2: Content-addressed storage ( type, ContentRegistry) - Phase 3: User-defined AttributeTypes (, , , ) - Phase 4: Insert and fetch integration (type composition) - Phase 5: Garbage collection (project-wide GC scanner) - Phase 6: Migration utilities (legacy external stores) - Phase 7: Documentation and testing Estimated effort: 24-32 days across all phases Co-authored-by: dimitri-yatsenko --- .../storage-types-implementation-plan.md | 570 ++++++++++++++++++ 1 file changed, 570 insertions(+) create mode 100644 docs/src/design/tables/storage-types-implementation-plan.md diff --git a/docs/src/design/tables/storage-types-implementation-plan.md b/docs/src/design/tables/storage-types-implementation-plan.md new file mode 100644 index 000000000..13d2e45d3 --- /dev/null +++ b/docs/src/design/tables/storage-types-implementation-plan.md @@ -0,0 +1,570 @@ +# DataJoint Storage Types Redesign - Implementation Plan + +## Executive Summary + +This plan describes the implementation of a three-layer type architecture for DataJoint, building on the existing `AttributeType` infrastructure. The key goals are: + +1. Establish a clean three-layer type hierarchy (native DB types, core DataJoint types, AttributeTypes) +2. Implement content-addressed storage with deduplication +3. Provide composable, user-friendly types (``, ``, ``) +4. Enable project-wide garbage collection via `ContentRegistry` +5. Maintain backward compatibility with existing schemas + +--- + +## Phase 1: Core Type System Foundation + +**Goal**: Establish the complete Layer 2 core type mappings and enhance the AttributeType infrastructure. + +### 1.1 Expand Core Type Mappings + +**Files to modify:** +- `src/datajoint/declare.py` + +**Current state**: `SQL_TYPE_ALIASES` already maps some types (float32, int32, etc.) + +**Changes needed**: +1. Complete the type mappings as per spec: + ``` + Core Type -> MySQL Type + int8 -> TINYINT + uint8 -> TINYINT UNSIGNED + int16 -> SMALLINT + ... + json -> JSON + uuid -> BINARY(16) or CHAR(36) + decimal -> DECIMAL(p,s) + ``` + +2. Add PostgreSQL mappings for future support (can be placeholder initially) + +**Dependencies**: None + +### 1.2 Enhance AttributeType with Store Parameter Support + +**Files to modify:** +- `src/datajoint/attribute_type.py` + +**Current state**: Types don't support `@store` parameter syntax + +**Changes needed**: +1. Add `store_name` property to `AttributeType` +2. Modify `resolve_dtype()` to handle `` syntax +3. Add `get_type_with_store(name_with_store)` helper that parses `xblob@cold` format + +```python +def parse_type_spec(spec: str) -> tuple[str, str | None]: + """Parse '' or '' into (type_name, store_name).""" + spec = spec.strip("<>") + if "@" in spec: + type_name, store_name = spec.split("@", 1) + return type_name, store_name + return spec, None +``` + +**Dependencies**: None + +### 1.3 Update Heading and Declaration Parsing + +**Files to modify:** +- `src/datajoint/heading.py` +- `src/datajoint/declare.py` + +**Changes needed**: +1. Update `TYPE_PATTERN` to recognize new AttributeType patterns +2. Store `store_name` in attribute metadata for parameterized types +3. 
Update `compile_attribute()` to handle `` syntax +4. Update `_init_from_database()` to reconstruct store information + +**Dependencies**: Phase 1.2 + +--- + +## Phase 2: Content-Addressed Storage Implementation + +**Goal**: Implement the `` type with content-addressed storage and deduplication. + +### 2.1 Create ContentRegistry Table + +**New file to create:** +- `src/datajoint/content_registry.py` + +**Implementation**: +```python +class ContentRegistry: + """ + Project-level content registry for content-addressed storage. + Stored in a designated database (e.g., `{project}_content`). + """ + definition = """ + # Content-addressed object registry (project-wide) + content_hash : char(64) # SHA256 hex + --- + store : varchar(64) # Store name + size : bigint unsigned # Size in bytes + created : timestamp DEFAULT CURRENT_TIMESTAMP + """ +``` + +Key features: +- Auto-create the registry database on first use +- Methods: `insert_content()`, `get_content()`, `increment_ref()`, `decrement_ref()` +- Thread-safe reference counting (if needed) + +**Dependencies**: None + +### 2.2 Implement ContentType AttributeType + +**Files to modify:** +- `src/datajoint/attribute_type.py` + +**New built-in type**: +```python +class ContentType(AttributeType): + """Built-in AttributeType for content-addressed storage.""" + type_name = "content" + dtype = "json" + + def encode(self, data: bytes, *, key=None, store_name=None) -> dict: + """Store content, return metadata as JSON.""" + content_hash = hashlib.sha256(data).hexdigest() + path = f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + # Store if not exists, register in ContentRegistry + ... + return {"hash": content_hash, "store": store_name, "size": len(data)} + + def decode(self, stored: dict, *, key=None) -> bytes: + """Retrieve content by hash.""" + ... +``` + +**Dependencies**: Phase 2.1 + +### 2.3 Implement Content Storage Backend Methods + +**Files to modify:** +- `src/datajoint/storage.py` + +**Changes needed**: +1. Add `put_content()` method with deduplication +2. Add `get_content()` method with hash verification +3. Add `compute_content_hash()` utility +4. Add content path generation: `_content/{hash[:2]}/{hash[2:4]}/{hash}` + +**Dependencies**: None + +--- + +## Phase 3: User-Defined AttributeTypes + +**Goal**: Implement the standard user-facing types that compose with `` and ``. + +### 3.1 Implement XBlobType (External Blob) + +**Files to modify:** +- `src/datajoint/attribute_type.py` + +```python +@register_type +class XBlobType(AttributeType): + """External serialized blob using content-addressed storage.""" + type_name = "xblob" + dtype = "" # Composition: uses ContentType + + def encode(self, value, *, key=None) -> bytes: + from . import blob + return blob.pack(value, compress=True) + + def decode(self, stored, *, key=None) -> Any: + from . 
import blob + return blob.unpack(stored) +``` + +**Key behavior**: Serializes to djblob format, stores via content-addressed storage + +**Dependencies**: Phase 2.2 + +### 3.2 Implement AttachType and XAttachType + +**Files to modify:** +- `src/datajoint/attribute_type.py` + +```python +@register_type +class AttachType(AttributeType): + """Internal file attachment stored in database.""" + type_name = "attach" + dtype = "longblob" + + def encode(self, filepath, *, key=None) -> bytes: + path = Path(filepath) + return path.name.encode() + b"\0" + path.read_bytes() + + def decode(self, stored, *, key=None) -> str: + filename, contents = stored.split(b"\0", 1) + # Write to download_path and return path + ... + +@register_type +class XAttachType(AttributeType): + """External file attachment using content-addressed storage.""" + type_name = "xattach" + dtype = "" + + def encode(self, filepath, *, key=None) -> bytes: + path = Path(filepath) + return path.name.encode() + b"\0" + path.read_bytes() + + def decode(self, stored, *, key=None) -> str: + # Same as AttachType.decode() + ... +``` + +**Dependencies**: Phase 2.2 + +### 3.3 Implement FilepathType + +**Files to modify:** +- `src/datajoint/attribute_type.py` + +```python +@register_type +class FilepathType(AttributeType): + """Portable relative path reference within configured stores.""" + type_name = "filepath" + dtype = "json" + + def encode(self, relative_path: str, *, key=None, store_name=None, + compute_checksum: bool = False) -> dict: + """Register reference to file in store.""" + store = get_store(store_name) # Required for filepath + metadata = {'path': relative_path, 'store': store_name} + if compute_checksum: + # Compute checksum and size + ... + return metadata + + def decode(self, stored: dict, *, key=None) -> ObjectRef: + """Return ObjectRef for lazy access.""" + return ObjectRef( + store=get_store(stored['store']), + path=stored['path'], + checksum=stored.get('checksum') + ) +``` + +**Key difference from legacy**: Returns `ObjectRef` instead of copying to local stage + +**Dependencies**: Existing `ObjectRef` and `StorageBackend` + +--- + +## Phase 4: Insert and Fetch Integration + +**Goal**: Update the data path to handle the new type system seamlessly. + +### 4.1 Update Insert Processing + +**Files to modify:** +- `src/datajoint/table.py` + +**Changes needed in `__make_placeholder()`**: +1. Handle type composition (resolve full type chain) +2. Pass `store_name` to `encode()` when applicable +3. Handle `` type's special behavior +4. Process `` with store parameter + +```python +def __make_placeholder(self, name, value, ...): + attr = self.heading[name] + if attr.adapter: + # Resolve type chain and pass store_name + final_dtype, type_chain = resolve_dtype(attr.adapter.dtype) + store_name = attr.store + + # Apply type chain: outer -> inner + for attr_type in type_chain: + value = attr_type.encode(value, key=key, store_name=store_name) + + # Continue with final_dtype processing + ... +``` + +**Dependencies**: Phases 1-3 + +### 4.2 Update Fetch Processing + +**Files to modify:** +- `src/datajoint/fetch.py` + +**Changes needed in `_get()`**: +1. Handle `` type: retrieve from content store +2. Handle type composition: apply decoders in reverse order +3. 
Handle ``: return `ObjectRef` instead of downloading + +```python +def _get(connection, attr, data, squeeze, download_path): + if attr.adapter: + final_dtype, type_chain = resolve_dtype(attr.adapter.dtype) + + # Process based on final_dtype + if final_dtype == "json": + data = json.loads(data) + elif final_dtype == "longblob": + # Handle content retrieval if needed + ... + + # Apply type chain in reverse: inner -> outer + for attr_type in reversed(type_chain): + data = attr_type.decode(data, key=key) + + return data +``` + +**Dependencies**: Phases 1-3 + +### 4.3 Update Heading Attribute Properties + +**Files to modify:** +- `src/datajoint/heading.py` + +**Changes needed**: +1. Add `is_content` property for content-addressed attributes +2. Update property detection logic for new types +3. Store composed type information for fetch/insert + +**Dependencies**: Phase 1.3 + +--- + +## Phase 5: Garbage Collection + +**Goal**: Implement project-wide garbage collection for content-addressed storage. + +### 5.1 Implement GC Scanner + +**New file to create:** +- `src/datajoint/gc.py` + +```python +def scan_content_references(project) -> set[tuple[str, str]]: + """ + Scan all schemas in project for content references. + + Returns: + Set of (content_hash, store) tuples that are referenced + """ + referenced = set() + for schema in project.schemas: + for table in schema.tables: + for attr in table.heading.attributes: + if attr.type in ('content', 'xblob', 'xattach'): + hashes = table.fetch(attr.name) + for h in hashes: + if isinstance(h, dict): + referenced.add((h['hash'], h.get('store'))) + return referenced + +def garbage_collect(project, dry_run=True) -> dict: + """ + Remove unreferenced content from storage. + + Returns: + Stats: {'scanned': N, 'orphaned': M, 'deleted': K, 'bytes_freed': B} + """ + ... +``` + +**Dependencies**: Phase 2.1 + +### 5.2 Add GC CLI Commands + +**Files to modify:** +- CLI or management interface + +**New commands**: +- `dj gc scan` - Scan and report orphaned content +- `dj gc clean` - Remove orphaned content +- `dj gc status` - Show content registry status + +**Dependencies**: Phase 5.1 + +--- + +## Phase 6: Migration Utilities + +**Goal**: Provide tools to migrate existing schemas to the new type system. + +### 6.1 Enhance Migration Module + +**Files to modify:** +- `src/datajoint/migrate.py` + +**New functions**: + +```python +def analyze_external_stores(schema) -> list[dict]: + """Analyze legacy ~external_* tables for migration.""" + ... + +def migrate_external_to_content(schema, store_name, dry_run=True) -> dict: + """ + Migrate legacy ~external_{store} to new ContentRegistry. + + Steps: + 1. Read entries from ~external_{store} + 2. For each entry: fetch content, compute SHA256 + 3. Copy to _content/{hash}/ if not exists + 4. Update referencing tables (UUID -> hash JSON) + 5. Register in ContentRegistry + """ + ... + +def migrate_blob_to_djblob(schema, dry_run=True) -> dict: + """Update implicit blob columns to use .""" + ... + +def migrate_filepath_to_new(schema, dry_run=True) -> dict: + """ + Migrate legacy filepath@store to new . + + Changes: + - UUID column -> JSON column + - Copy-based access -> ObjectRef-based access + """ + ... 
+``` + +### 6.2 Create Migration CLI + +**New commands**: +- `dj migrate analyze ` - Analyze migration needs +- `dj migrate external ` - Migrate external store +- `dj migrate blobs ` - Migrate blob columns +- `dj migrate status ` - Show migration status + +**Dependencies**: Phase 6.1 + +--- + +## Phase 7: Documentation and Testing + +### 7.1 Unit Tests + +**New test files:** +- `tests/test_content_type.py` - Content-addressed storage tests +- `tests/test_xblob.py` - XBlob type tests +- `tests/test_attach_types.py` - Attachment type tests +- `tests/test_filepath_new.py` - New filepath tests +- `tests/test_gc.py` - Garbage collection tests +- `tests/test_migration.py` - Migration utility tests + +**Existing test files to update:** +- `tests/test_attribute_type.py` - Add new type tests +- `tests/test_object.py` - Verify object type unchanged + +### 7.2 Integration Tests + +**Test scenarios**: +1. Insert/fetch roundtrip for all new types +2. Type composition (xblob using content) +3. Multi-schema content deduplication +4. GC with cross-schema references +5. Migration from legacy external stores +6. Backward compatibility with existing schemas + +### 7.3 Documentation + +**Files to update:** +- `docs/src/design/tables/storage-types-spec.md` - Already exists +- Create user guide for new types +- Create migration guide +- Update API reference + +--- + +## Implementation Order and Dependencies + +``` +Phase 1: Core Type System Foundation +├── 1.1 Expand Core Type Mappings (no deps) +├── 1.2 Enhance AttributeType with Store Parameter (no deps) +└── 1.3 Update Heading and Declaration Parsing (depends on 1.2) + +Phase 2: Content-Addressed Storage +├── 2.1 Create ContentRegistry Table (no deps) +├── 2.2 Implement ContentType (depends on 2.1) +└── 2.3 Content Storage Backend Methods (no deps) + +Phase 3: User-Defined AttributeTypes (depends on Phase 2) +├── 3.1 Implement XBlobType (depends on 2.2) +├── 3.2 Implement AttachType and XAttachType (depends on 2.2) +└── 3.3 Implement FilepathType (no deps) + +Phase 4: Insert and Fetch Integration (depends on Phases 1-3) +├── 4.1 Update Insert Processing +├── 4.2 Update Fetch Processing +└── 4.3 Update Heading Attribute Properties + +Phase 5: Garbage Collection (depends on Phase 2) +├── 5.1 Implement GC Scanner +└── 5.2 Add GC CLI Commands + +Phase 6: Migration Utilities (depends on Phases 2-4) +├── 6.1 Enhance Migration Module +└── 6.2 Create Migration CLI + +Phase 7: Documentation and Testing (ongoing) +``` + +--- + +## Critical Files Summary + +| File | Changes | +|------|---------| +| `src/datajoint/attribute_type.py` | All new AttributeTypes: `ContentType`, `XBlobType`, `AttachType`, `XAttachType`, `FilepathType` | +| `src/datajoint/declare.py` | Type pattern parsing, SQL generation, `` syntax | +| `src/datajoint/heading.py` | Attribute metadata, composed type information | +| `src/datajoint/table.py` | Insert logic with type composition | +| `src/datajoint/fetch.py` | Fetch logic with type chain decoding | +| `src/datajoint/content_registry.py` | **New**: ContentRegistry table and methods | +| `src/datajoint/gc.py` | **New**: Garbage collection scanner | +| `src/datajoint/migrate.py` | Migration utilities | + +--- + +## Risk Mitigation + +### Backward Compatibility +1. All existing types (`longblob`, `blob@store`, `attach@store`, `filepath@store`) continue to work +2. Legacy `~external_*` tables remain functional during transition +3. Implicit blob serialization preserved for existing schemas +4. 
Migration is opt-in and reversible + +### Performance Considerations +1. Content hashing uses SHA256 (fast, widely supported) +2. Deduplication reduces storage costs +3. Lazy ObjectRef prevents unnecessary I/O +4. GC runs on-demand, not automatically + +### Error Handling +1. Content hash verification on fetch (optional) +2. Graceful handling of missing content +3. Transaction safety for multi-table operations +4. Clear error messages for misconfiguration + +--- + +## Estimated Effort + +| Phase | Estimated Days | Risk | +|-------|----------------|------| +| Phase 1 | 3-4 days | Low | +| Phase 2 | 4-5 days | Medium | +| Phase 3 | 3-4 days | Low | +| Phase 4 | 4-5 days | Medium | +| Phase 5 | 2-3 days | Low | +| Phase 6 | 3-4 days | Medium | +| Phase 7 | 5-7 days | Low | +| **Total** | **24-32 days** | | From 979f45bcc606361134ce45a42f4304cc71e4fbea Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 18:58:34 +0000 Subject: [PATCH 24/41] Implement Phase 1: Core type system with store parameter support Phase 1.1 - Core type mappings already complete in declare.py Phase 1.2 - Enhanced AttributeType with store parameter support: - Added parse_type_spec() to parse "" into (type_name, store_name) - Updated get_type() to handle parameterized types - Updated is_type_registered() to ignore store parameters - Updated resolve_dtype() to propagate store through type chains - Returns (final_dtype, type_chain, store_name) tuple - Store from outer type overrides inner type's store Phase 1.3 - Updated heading and declaration parsing: - Updated get_adapter() to return (adapter, store_name) tuple - Updated substitute_special_type() to capture store from ADAPTED types - Store parameter is now properly passed through type resolution Co-authored-by: dimitri-yatsenko --- src/datajoint/attribute_adapter.py | 28 ++++----- src/datajoint/attribute_type.py | 92 +++++++++++++++++++++++------- src/datajoint/declare.py | 5 +- 3 files changed, 91 insertions(+), 34 deletions(-) diff --git a/src/datajoint/attribute_adapter.py b/src/datajoint/attribute_adapter.py index 7df566a58..cf00b8e4b 100644 --- a/src/datajoint/attribute_adapter.py +++ b/src/datajoint/attribute_adapter.py @@ -12,7 +12,7 @@ import warnings from typing import Any -from .attribute_type import AttributeType, get_type, is_type_registered +from .attribute_type import AttributeType, get_type, is_type_registered, parse_type_spec from .errors import DataJointError # Pattern to detect blob types for internal pack/unpack @@ -154,7 +154,7 @@ def get(self, value: Any) -> Any: raise NotImplementedError(f"{self.__class__.__name__} must implement get() or migrate to decode()") -def get_adapter(context: dict | None, adapter_name: str) -> AttributeType: +def get_adapter(context: dict | None, adapter_name: str) -> tuple[AttributeType, str | None]: """ Get an attribute type/adapter by name. @@ -165,47 +165,49 @@ def get_adapter(context: dict | None, adapter_name: str) -> AttributeType: Args: context: Schema context dictionary (for legacy adapters). adapter_name: The adapter/type name, with or without angle brackets. + May include store parameter (e.g., ""). Returns: - The AttributeType instance. + Tuple of (AttributeType instance, store_name or None). Raises: DataJointError: If the adapter is not found or invalid. 
""" - adapter_name = adapter_name.lstrip("<").rstrip(">") + # Parse type name and optional store parameter + type_name, store_name = parse_type_spec(adapter_name) # First, check the global type registry (new system) - if is_type_registered(adapter_name): - return get_type(adapter_name) + if is_type_registered(type_name): + return get_type(type_name), store_name # Fall back to context-based lookup (legacy system) if context is None: raise DataJointError( - f"Attribute type <{adapter_name}> is not registered. " "Use @dj.register_type to register custom types." + f"Attribute type <{type_name}> is not registered. " "Use @dj.register_type to register custom types." ) try: - adapter = context[adapter_name] + adapter = context[type_name] except KeyError: raise DataJointError( - f"Attribute type <{adapter_name}> is not defined. " + f"Attribute type <{type_name}> is not defined. " "Register it with @dj.register_type or include it in the schema context." ) # Validate it's an AttributeType (or legacy AttributeAdapter) if not isinstance(adapter, AttributeType): raise DataJointError( - f"Attribute adapter '{adapter_name}' must be an instance of " + f"Attribute adapter '{type_name}' must be an instance of " "datajoint.AttributeType (or legacy datajoint.AttributeAdapter)" ) # For legacy adapters from context, store the name they were looked up by if isinstance(adapter, AttributeAdapter): - adapter._type_name = adapter_name + adapter._type_name = type_name # Validate the dtype/attribute_type dtype = adapter.dtype if not isinstance(dtype, str) or not re.match(r"^\w", dtype): - raise DataJointError(f"Invalid dtype '{dtype}' in attribute type <{adapter_name}>") + raise DataJointError(f"Invalid dtype '{dtype}' in attribute type <{type_name}>") - return adapter + return adapter, store_name diff --git a/src/datajoint/attribute_type.py b/src/datajoint/attribute_type.py index 9be2d2214..97ca54646 100644 --- a/src/datajoint/attribute_type.py +++ b/src/datajoint/attribute_type.py @@ -242,6 +242,32 @@ class GraphType(dj.AttributeType): return cls +def parse_type_spec(spec: str) -> tuple[str, str | None]: + """ + Parse a type specification into type name and optional store parameter. + + Handles formats like: + - "" -> ("xblob", None) + - "" -> ("xblob", "cold") + - "xblob@cold" -> ("xblob", "cold") + - "xblob" -> ("xblob", None) + + Args: + spec: Type specification string, with or without angle brackets. + + Returns: + Tuple of (type_name, store_name). store_name is None if not specified. + """ + # Strip angle brackets + spec = spec.strip("<>").strip() + + if "@" in spec: + type_name, store_name = spec.split("@", 1) + return type_name.strip(), store_name.strip() + + return spec, None + + def unregister_type(name: str) -> None: """ Remove a type from the registry. @@ -269,6 +295,7 @@ def get_type(name: str) -> AttributeType: Args: name: The type name, with or without angle brackets. + Store parameters (e.g., "") are stripped. Returns: The registered AttributeType instance. @@ -276,20 +303,22 @@ def get_type(name: str) -> AttributeType: Raises: DataJointError: If the type is not found. 
""" - name = name.strip("<>") + # Strip angle brackets and store parameter + type_name, _ = parse_type_spec(name) # Check explicit registry first - if name in _type_registry: - return _type_registry[name] + if type_name in _type_registry: + return _type_registry[type_name] # Lazy-load entry points _load_entry_points() - if name in _type_registry: - return _type_registry[name] + if type_name in _type_registry: + return _type_registry[type_name] raise DataJointError( - f"Unknown attribute type: <{name}>. " f"Ensure the type is registered via @dj.register_type or installed as a package." + f"Unknown attribute type: <{type_name}>. " + f"Ensure the type is registered via @dj.register_type or installed as a package." ) @@ -309,16 +338,16 @@ def is_type_registered(name: str) -> bool: Check if a type name is registered. Args: - name: The type name to check. + name: The type name to check (store parameters are ignored). Returns: True if the type is registered. """ - name = name.strip("<>") - if name in _type_registry: + type_name, _ = parse_type_spec(name) + if type_name in _type_registry: return True _load_entry_points() - return name in _type_registry + return type_name in _type_registry def _load_entry_points() -> None: @@ -368,23 +397,37 @@ def _load_entry_points() -> None: logger.warning(f"Failed to load attribute type '{ep.name}' from {ep.value}: {e}") -def resolve_dtype(dtype: str, seen: set[str] | None = None) -> tuple[str, list[AttributeType]]: +def resolve_dtype( + dtype: str, seen: set[str] | None = None, store_name: str | None = None +) -> tuple[str, list[AttributeType], str | None]: """ Resolve a dtype string, following type chains. If dtype references another custom type (e.g., ""), recursively - resolves to find the ultimate storage type. + resolves to find the ultimate storage type. Store parameters are propagated + through the chain. Args: - dtype: The dtype string to resolve. + dtype: The dtype string to resolve (e.g., "", "", "longblob"). seen: Set of already-seen type names (for cycle detection). + store_name: Store name from outer type specification (propagated inward). Returns: - Tuple of (final_storage_type, list_of_types_in_chain). + Tuple of (final_storage_type, list_of_types_in_chain, resolved_store_name). The chain is ordered from outermost to innermost type. Raises: DataJointError: If a circular type reference is detected. 
+ + Examples: + >>> resolve_dtype("") + ("json", [XBlobType, ContentType], None) + + >>> resolve_dtype("") + ("json", [XBlobType, ContentType], "cold") + + >>> resolve_dtype("longblob") + ("longblob", [], None) """ if seen is None: seen = set() @@ -393,7 +436,10 @@ def resolve_dtype(dtype: str, seen: set[str] | None = None) -> tuple[str, list[A # Check if dtype is a custom type reference if dtype.startswith("<") and dtype.endswith(">"): - type_name = dtype[1:-1] + type_name, dtype_store = parse_type_spec(dtype) + + # Store from this level overrides inherited store + effective_store = dtype_store if dtype_store is not None else store_name if type_name in seen: raise DataJointError(f"Circular type reference detected: <{type_name}>") @@ -402,13 +448,19 @@ def resolve_dtype(dtype: str, seen: set[str] | None = None) -> tuple[str, list[A attr_type = get_type(type_name) chain.append(attr_type) - # Recursively resolve the inner dtype - inner_dtype, inner_chain = resolve_dtype(attr_type.dtype, seen) + # Recursively resolve the inner dtype, propagating store + inner_dtype, inner_chain, resolved_store = resolve_dtype(attr_type.dtype, seen, effective_store) chain.extend(inner_chain) - return inner_dtype, chain + return inner_dtype, chain, resolved_store + + # Not a custom type - check if it has a store suffix (e.g., "blob@store") + if "@" in dtype: + base_type, dtype_store = dtype.split("@", 1) + effective_store = dtype_store if dtype_store else store_name + return base_type, chain, effective_store - # Not a custom type - return as-is - return dtype, chain + # Plain type - return as-is with propagated store + return dtype, chain, store_name # ============================================================================= diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index 397da108b..8aabddc00 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -489,7 +489,10 @@ def substitute_special_type(match, category, foreign_key_sql, context): "ON UPDATE RESTRICT ON DELETE RESTRICT".format(external_table_root=EXTERNAL_TABLE_ROOT, **match) ) elif category == "ADAPTED": - attr_type = get_adapter(context, match["type"]) + attr_type, store_name = get_adapter(context, match["type"]) + # Store the store parameter if present + if store_name is not None: + match["store"] = store_name match["type"] = attr_type.dtype category = match_type(match["type"]) if category in SPECIAL_TYPES: From 6926c5871a9be749b8bcf06eb2616c34d084844d Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 20:08:48 +0000 Subject: [PATCH 25/41] Remove legacy AttributeAdapter support, update tests for AttributeType - Remove AttributeAdapter class and context-based lookup from attribute_adapter.py - Simplify attribute_adapter.py to compatibility shim that re-exports from attribute_type - Remove AttributeAdapter from package exports in __init__.py - Update tests/schema_adapted.py to use @dj.register_type decorator - Update tests/test_adapted_attributes.py to work with globally registered types - Remove test_attribute_adapter_deprecated test from test_attribute_type.py Types are now registered globally via @dj.register_type decorator, eliminating the need for context-based adapter lookup. 
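
With types registered globally, schema activation and virtual modules no longer
need adapter objects in their context. A minimal sketch (the schema and table
names here are illustrative; assumes a "graph" type has been registered as in
tests/schema_adapted.py):

    import datajoint as dj

    schema = dj.schema("my_schema")  # no adapter context required

    @schema
    class Connectivity(dj.Manual):
        definition = """
        conn_id : int
        ---
        conn_graph = null : <graph>  # resolved from the global type registry
        """

    # Virtual modules likewise no longer need add_objects for adapters
    vmod = dj.VirtualModule("virtual_module", "my_schema")
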
Co-authored-by: dimitri-yatsenko --- src/datajoint/__init__.py | 2 - src/datajoint/attribute_adapter.py | 203 +++-------------------------- tests/schema_adapted.py | 42 +++--- tests/test_adapted_attributes.py | 29 +---- tests/test_attribute_type.py | 7 - 5 files changed, 44 insertions(+), 239 deletions(-) diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py index 405134630..a19aae6d0 100644 --- a/src/datajoint/__init__.py +++ b/src/datajoint/__init__.py @@ -48,7 +48,6 @@ "AttributeType", "register_type", "list_types", - "AttributeAdapter", # Deprecated, use AttributeType "errors", "migrate", "DataJointError", @@ -62,7 +61,6 @@ from . import errors from . import migrate from .admin import kill -from .attribute_adapter import AttributeAdapter from .attribute_type import AttributeType, list_types, register_type from .blob import MatCell, MatStruct from .cli import cli diff --git a/src/datajoint/attribute_adapter.py b/src/datajoint/attribute_adapter.py index cf00b8e4b..c92618f9e 100644 --- a/src/datajoint/attribute_adapter.py +++ b/src/datajoint/attribute_adapter.py @@ -1,213 +1,42 @@ """ -Legacy attribute adapter module. +Attribute adapter module - compatibility shim. -This module provides backward compatibility for the deprecated AttributeAdapter class. -New code should use :class:`datajoint.AttributeType` instead. +This module re-exports functions from attribute_type for backward compatibility +with code that imports from attribute_adapter. .. deprecated:: 0.15 - Use :class:`datajoint.AttributeType` with ``encode``/``decode`` methods. + Import directly from :mod:`datajoint.attribute_type` instead. """ -import re -import warnings -from typing import Any - -from .attribute_type import AttributeType, get_type, is_type_registered, parse_type_spec +from .attribute_type import ( + AttributeType, + get_type, + is_type_registered, + parse_type_spec, +) from .errors import DataJointError -# Pattern to detect blob types for internal pack/unpack -_BLOB_PATTERN = re.compile(r"^(tiny|small|medium|long|)blob", re.I) - - -class AttributeAdapter(AttributeType): - """ - Legacy base class for attribute adapters. - - .. deprecated:: 0.15 - Use :class:`datajoint.AttributeType` with ``encode``/``decode`` methods instead. - - This class provides backward compatibility for existing adapters that use - the ``attribute_type``, ``put()``, and ``get()`` API. - - Migration guide:: - - # Old style (deprecated): - class GraphAdapter(dj.AttributeAdapter): - attribute_type = "longblob" - - def put(self, graph): - return list(graph.edges) - - def get(self, edges): - return nx.Graph(edges) - - # New style (recommended): - @dj.register_type - class GraphType(dj.AttributeType): - type_name = "graph" - dtype = "longblob" - - def encode(self, graph, *, key=None): - return list(graph.edges) - - def decode(self, edges, *, key=None): - return nx.Graph(edges) - """ - - # Subclasses can set this as a class attribute instead of property - attribute_type: str = None # type: ignore - - def __init__(self): - # Emit deprecation warning on instantiation - warnings.warn( - f"{self.__class__.__name__} uses the deprecated AttributeAdapter API. " - "Migrate to AttributeType with encode/decode methods.", - DeprecationWarning, - stacklevel=2, - ) - - @property - def type_name(self) -> str: - """ - Infer type name from class name for legacy adapters. - - Legacy adapters were identified by their variable name in the context dict, - not by a property. For backward compatibility, we use the lowercase class name. 
- """ - # Check if a _type_name was explicitly set (for context-based lookup) - if hasattr(self, "_type_name"): - return self._type_name - # Fall back to class name - return self.__class__.__name__.lower() - - @property - def dtype(self) -> str: - """Map legacy attribute_type to new dtype property.""" - attr_type = self.attribute_type - if attr_type is None: - raise NotImplementedError( - f"{self.__class__.__name__} must define 'attribute_type' " "(or migrate to AttributeType with 'dtype')" - ) - return attr_type - - def _is_blob_dtype(self) -> bool: - """Check if dtype is a blob type requiring pack/unpack.""" - return bool(_BLOB_PATTERN.match(self.dtype)) - - def encode(self, value: Any, *, key: dict | None = None) -> Any: - """ - Delegate to legacy put() method, with blob packing if needed. - - Legacy adapters expect blob.pack to be called after put() when - the dtype is a blob type. This wrapper handles that automatically. - """ - result = self.put(value) - # Legacy adapters expect blob.pack after put() for blob dtypes - if self._is_blob_dtype(): - from . import blob - - result = blob.pack(result) - return result - - def decode(self, stored: Any, *, key: dict | None = None) -> Any: - """ - Delegate to legacy get() method, with blob unpacking if needed. - - Legacy adapters expect blob.unpack to be called before get() when - the dtype is a blob type. This wrapper handles that automatically. - """ - # Legacy adapters expect blob.unpack before get() for blob dtypes - if self._is_blob_dtype(): - from . import blob - - stored = blob.unpack(stored) - return self.get(stored) - - def put(self, obj: Any) -> Any: - """ - Convert an object of the adapted type into a storable value. - - .. deprecated:: 0.15 - Override ``encode()`` instead. - - Args: - obj: An object of the adapted type. - - Returns: - Value to store in the database. - """ - raise NotImplementedError(f"{self.__class__.__name__} must implement put() or migrate to encode()") - - def get(self, value: Any) -> Any: - """ - Convert a value from the database into the adapted type. - - .. deprecated:: 0.15 - Override ``decode()`` instead. - - Args: - value: Value from the database. - - Returns: - Object of the adapted type. - """ - raise NotImplementedError(f"{self.__class__.__name__} must implement get() or migrate to decode()") - def get_adapter(context: dict | None, adapter_name: str) -> tuple[AttributeType, str | None]: """ - Get an attribute type/adapter by name. - - This function provides backward compatibility by checking both: - 1. The global type registry (new system) - 2. The schema context dict (legacy system) + Get an attribute type by name. Args: - context: Schema context dictionary (for legacy adapters). - adapter_name: The adapter/type name, with or without angle brackets. + context: Ignored (legacy parameter, kept for API compatibility). + adapter_name: The type name, with or without angle brackets. May include store parameter (e.g., ""). Returns: Tuple of (AttributeType instance, store_name or None). Raises: - DataJointError: If the adapter is not found or invalid. + DataJointError: If the type is not found. """ # Parse type name and optional store parameter type_name, store_name = parse_type_spec(adapter_name) - # First, check the global type registry (new system) + # Look up in the global type registry if is_type_registered(type_name): return get_type(type_name), store_name - # Fall back to context-based lookup (legacy system) - if context is None: - raise DataJointError( - f"Attribute type <{type_name}> is not registered. 
" "Use @dj.register_type to register custom types." - ) - - try: - adapter = context[type_name] - except KeyError: - raise DataJointError( - f"Attribute type <{type_name}> is not defined. " - "Register it with @dj.register_type or include it in the schema context." - ) - - # Validate it's an AttributeType (or legacy AttributeAdapter) - if not isinstance(adapter, AttributeType): - raise DataJointError( - f"Attribute adapter '{type_name}' must be an instance of " - "datajoint.AttributeType (or legacy datajoint.AttributeAdapter)" - ) - - # For legacy adapters from context, store the name they were looked up by - if isinstance(adapter, AttributeAdapter): - adapter._type_name = type_name - - # Validate the dtype/attribute_type - dtype = adapter.dtype - if not isinstance(dtype, str) or not re.match(r"^\w", dtype): - raise DataJointError(f"Invalid dtype '{dtype}' in attribute type <{type_name}>") - - return adapter, store_name + raise DataJointError(f"Attribute type <{type_name}> is not registered. " "Use @dj.register_type to register custom types.") diff --git a/tests/schema_adapted.py b/tests/schema_adapted.py index c7b5830c0..321edfc7b 100644 --- a/tests/schema_adapted.py +++ b/tests/schema_adapted.py @@ -7,40 +7,42 @@ import datajoint as dj -class GraphAdapter(dj.AttributeAdapter): - attribute_type = "longblob" # this is how the attribute will be declared +@dj.register_type +class GraphType(dj.AttributeType): + """Custom type for storing NetworkX graphs as edge lists.""" - @staticmethod - def get(obj): - # convert edge list into a graph - return nx.Graph(obj) + type_name = "graph" + dtype = "longblob" - @staticmethod - def put(obj): - # convert graph object into an edge list + def encode(self, obj, *, key=None): + """Convert graph object into an edge list.""" assert isinstance(obj, nx.Graph) return list(obj.edges) + def decode(self, stored, *, key=None): + """Convert edge list into a graph.""" + return nx.Graph(stored) -class LayoutToFilepath(dj.AttributeAdapter): - """ - An adapted data type that saves a graph layout into fixed filepath - """ - attribute_type = "filepath@repo-s3" +@dj.register_type +class LayoutToFilepathType(dj.AttributeType): + """Custom type that saves a graph layout to a filepath.""" - @staticmethod - def get(path): - with open(path, "r") as f: - return json.load(f) + type_name = "layout_to_filepath" + dtype = "filepath@repo-s3" - @staticmethod - def put(layout): + def encode(self, layout, *, key=None): + """Save layout to file and return path.""" path = Path(dj.config["stores"]["repo-s3"]["stage"], "layout.json") with open(str(path), "w") as f: json.dump(layout, f) return path + def decode(self, path, *, key=None): + """Load layout from file.""" + with open(path, "r") as f: + return json.load(f) + class Connectivity(dj.Manual): definition = """ diff --git a/tests/test_adapted_attributes.py b/tests/test_adapted_attributes.py index 0b4285ffb..eb5cd760d 100644 --- a/tests/test_adapted_attributes.py +++ b/tests/test_adapted_attributes.py @@ -1,10 +1,9 @@ """ Tests for adapted/custom attribute types. -These tests use the legacy AttributeAdapter API for backward compatibility testing. +These tests verify the AttributeType system for custom data types. """ -import warnings from itertools import zip_longest import networkx as nx @@ -15,40 +14,23 @@ from . 
import schema_adapted from .schema_adapted import Connectivity, Layout -# Filter deprecation warnings from legacy AttributeAdapter usage in these tests -pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning") - @pytest.fixture def schema_name(prefix): return prefix + "_test_custom_datatype" -@pytest.fixture -def adapted_graph_instance(): - with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) - yield schema_adapted.GraphAdapter() - - @pytest.fixture def schema_ad( connection_test, - adapted_graph_instance, enable_filepath_feature, s3_creds, tmpdir, schema_name, ): dj.config["stores"] = {"repo-s3": dict(s3_creds, protocol="s3", location="adapted/repo", stage=str(tmpdir))} - with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) - layout_adapter = schema_adapted.LayoutToFilepath() - context = { - **schema_adapted.LOCALS_ADAPTED, - "graph": adapted_graph_instance, - "layout_to_filepath": layout_adapter, - } + # Types are registered globally via @dj.register_type decorator in schema_adapted + context = {**schema_adapted.LOCALS_ADAPTED} schema = dj.schema(schema_name, context=context, connection=connection_test) schema(schema_adapted.Connectivity) schema(schema_adapted.Layout) @@ -66,9 +48,10 @@ def local_schema(schema_ad, schema_name): @pytest.fixture -def schema_virtual_module(schema_ad, adapted_graph_instance, schema_name): +def schema_virtual_module(schema_ad, schema_name): """Fixture for testing virtual modules""" - schema_virtual_module = dj.VirtualModule("virtual_module", schema_name, add_objects={"graph": adapted_graph_instance}) + # Types are registered globally, no need to add_objects for adapters + schema_virtual_module = dj.VirtualModule("virtual_module", schema_name) return schema_virtual_module diff --git a/tests/test_attribute_type.py b/tests/test_attribute_type.py index f8f822a60..e9220bfd4 100644 --- a/tests/test_attribute_type.py +++ b/tests/test_attribute_type.py @@ -340,12 +340,6 @@ def test_exports_from_datajoint(self): assert hasattr(dj, "register_type") assert hasattr(dj, "list_types") - def test_attribute_adapter_deprecated(self): - """Test that AttributeAdapter is still available but deprecated.""" - assert hasattr(dj, "AttributeAdapter") - # AttributeAdapter should be a subclass of AttributeType - assert issubclass(dj.AttributeAdapter, dj.AttributeType) - class TestDJBlobType: """Tests for the built-in DJBlobType.""" @@ -405,7 +399,6 @@ def test_djblob_handles_serialization(self): With the new design: - Plain longblob columns store/return raw bytes (no serialization) - handles pack/unpack in encode/decode - - Legacy AttributeAdapter handles pack/unpack internally for backward compat """ blob_type = get_type("djblob") From 97bc16260cf701a90ebc4df1e26f175ca8d58ed4 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 20:29:02 +0000 Subject: [PATCH 26/41] Simplify core type system: remove SERIALIZED_TYPES, clarify blob semantics Core types (uuid, json, blob) now map directly to native database types without any implicit serialization. Serialization is handled by AttributeTypes like via encode()/decode() methods. 
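
Illustrative sketch of the resulting semantics (table and attribute names are
hypothetical):

    class Example(dj.Manual):
        definition = """
        example_id : int
        ---
        raw_bytes : longblob  # core type: stores and returns raw bytes as-is
        matrix    : <djblob>  # AttributeType: serializes Python objects
        """

Here `longblob` maps directly to the native column type with no implicit
packing, while <djblob> applies DataJoint blob packing/unpacking in its
encode()/decode() methods.
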
Changes: - Rename SERIALIZED_TYPES to BINARY_TYPES in declare.py (clearer naming) - Update check for default values in compile_attribute() - Clarify in spec that core blob types store raw bytes Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 7 +++++-- src/datajoint/declare.py | 11 ++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 3d70c908e..a962ee6c8 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -89,10 +89,13 @@ MySQL and PostgreSQL backends. Users should prefer these over native database ty ### Binary Types +Core binary types store raw bytes without any serialization. Use `` AttributeType +for serialized Python objects. + | Core Type | Description | MySQL | PostgreSQL | |-----------|-------------|-------|------------| -| `blob` | Binary up to 64KB | `BLOB` | `BYTEA` | -| `longblob` | Binary up to 4GB | `LONGBLOB` | `BYTEA` | +| `blob` | Raw bytes up to 64KB | `BLOB` | `BYTEA` | +| `longblob` | Raw bytes up to 4GB | `LONGBLOB` | `BYTEA` | ### Special Types diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index 8aabddc00..df89dede2 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -87,14 +87,15 @@ "EXTERNAL_BLOB", "FILEPATH", } # data referenced by a UUID in external tables -SERIALIZED_TYPES = { +# Blob and attachment types cannot have SQL default values (other than NULL) +BINARY_TYPES = { "EXTERNAL_ATTACH", "INTERNAL_ATTACH", "EXTERNAL_BLOB", "INTERNAL_BLOB", -} # requires packing data +} -assert set().union(SPECIAL_TYPES, EXTERNAL_TYPES, SERIALIZED_TYPES) <= set(TYPE_PATTERN) +assert set().union(SPECIAL_TYPES, EXTERNAL_TYPES, BINARY_TYPES) <= set(TYPE_PATTERN) def match_type(attribute_type): @@ -549,12 +550,12 @@ def compile_attribute(line, in_key, foreign_key_sql, context): match["comment"] = ":{type}:{comment}".format(**match) # insert custom type into comment substitute_special_type(match, category, foreign_key_sql, context) - if category in SERIALIZED_TYPES and match["default"] not in { + if category in BINARY_TYPES and match["default"] not in { "DEFAULT NULL", "NOT NULL", }: raise DataJointError( - "The default value for a blob or attachment attributes can only be NULL in:\n{line}".format(line=line) + "The default value for blob or attachment attributes can only be NULL in:\n{line}".format(line=line) ) sql = ("`{name}` {type} {default}" + (' COMMENT "{comment}"' if match["comment"] else "")).format(**match) From 2de222ad8e94307b4c9049aab08e9e4c23e5b487 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 20:42:28 +0000 Subject: [PATCH 27/41] Simplify type system: only core types and AttributeTypes Major simplification of the type system to two categories: 1. Core DataJoint types (no brackets): float32, uuid, bool, json, blob, etc. 2. AttributeTypes (angle brackets): , , , etc. 
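
A short illustrative definition mixing the two categories (table and attribute
names are hypothetical; <graph> stands for any registered AttributeType):

    class Session(dj.Manual):
        definition = """
        session_id   : uint16   # core type -> SMALLINT UNSIGNED
        ---
        session_uuid : uuid     # core type -> BINARY(16)
        is_valid     : bool     # core type -> TINYINT
        params       : json     # core type -> JSON (JSONB on PostgreSQL)
        conn_graph   : <graph>  # AttributeType: encode/decode transformation
        """
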
Changes: - declare.py: Remove EXTERNAL_TYPES, BINARY_TYPES; simplify to CORE_TYPE_ALIASES + ADAPTED - heading.py: Remove is_attachment, is_filepath, is_object, is_external flags - fetch.py: Simplify _get() to only handle uuid, json, blob, and adapters - table.py: Simplify __make_placeholder() to only handle uuid, json, blob, numeric - preview.py: Remove special object field handling (will be AttributeType) - staged_insert.py: Update object type check to use adapter All special handling (attach, filepath, object, external storage) will be implemented as built-in AttributeTypes in subsequent phases. Co-authored-by: dimitri-yatsenko --- src/datajoint/declare.py | 117 +++++++++++---------------------- src/datajoint/fetch.py | 110 +++++++++++-------------------- src/datajoint/heading.py | 103 ++++++++++------------------- src/datajoint/preview.py | 6 +- src/datajoint/staged_insert.py | 5 +- src/datajoint/table.py | 51 +++++++------- 6 files changed, 140 insertions(+), 252 deletions(-) diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index df89dede2..a333d5f87 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -11,13 +11,13 @@ from .attribute_adapter import get_adapter from .condition import translate_attribute -from .errors import FILEPATH_FEATURE_SWITCH, DataJointError, _support_filepath_types +from .errors import DataJointError from .settings import config -UUID_DATA_TYPE = "binary(16)" - -# Type aliases for numeric types -SQL_TYPE_ALIASES = { +# Core DataJoint type aliases - scientist-friendly names mapped to native SQL types +# These types can be used without angle brackets in table definitions +CORE_TYPE_ALIASES = { + # Numeric types "FLOAT32": "float", "FLOAT64": "double", "INT64": "bigint", @@ -29,18 +29,22 @@ "INT8": "tinyint", "UINT8": "tinyint unsigned", "BOOL": "tinyint", + # UUID type + "UUID": "binary(16)", } + MAX_TABLE_NAME_LENGTH = 64 CONSTANT_LITERALS = { "CURRENT_TIMESTAMP", "NULL", } # SQL literals to be used without quotes (case insensitive) -EXTERNAL_TABLE_ROOT = "~external" +# Type patterns for declaration parsing +# Two categories: core type aliases and native passthrough types TYPE_PATTERN = { k: re.compile(v, re.I) for k, v in dict( - # Type aliases must come before INTEGER and FLOAT patterns to avoid prefix matching + # Core DataJoint type aliases (scientist-friendly names) FLOAT32=r"float32$", FLOAT64=r"float64$", INT64=r"int64$", @@ -51,8 +55,9 @@ UINT16=r"uint16$", INT8=r"int8$", UINT8=r"uint8$", - BOOL=r"bool$", # aliased to tinyint - # Native MySQL types + BOOL=r"bool$", + UUID=r"uuid$", + # Native SQL types (passthrough) INTEGER=r"((tiny|small|medium|big|)int|integer)(\s*\(.+\))?(\s+unsigned)?(\s+auto_increment)?|serial$", DECIMAL=r"(decimal|numeric)(\s*\(.+\))?(\s+unsigned)?$", FLOAT=r"(double|float|real)(\s*\(.+\))?(\s+unsigned)?$", @@ -60,42 +65,19 @@ JSON=r"json$", ENUM=r"enum\s*\(.+\)$", TEMPORAL=r"(date|datetime|time|timestamp|year)(\s*\(.+\))?$", - INTERNAL_BLOB=r"(tiny|small|medium|long|)blob$", - EXTERNAL_BLOB=r"blob@(?P[a-z][\-\w]*)$", - INTERNAL_ATTACH=r"attach$", - EXTERNAL_ATTACH=r"attach@(?P[a-z][\-\w]*)$", - FILEPATH=r"filepath@(?P[a-z][\-\w]*)$", - OBJECT=r"object(@(?P[a-z][\-\w]*))?$", # managed object storage (files/folders) - UUID=r"uuid$", + BLOB=r"(tiny|small|medium|long|)blob$", + # AttributeTypes use angle brackets ADAPTED=r"<.+>$", ).items() } -# custom types are stored in attribute comment -SPECIAL_TYPES = { - "UUID", - "INTERNAL_ATTACH", - "EXTERNAL_ATTACH", - "EXTERNAL_BLOB", - "FILEPATH", - 
"OBJECT", - "ADAPTED", -} | set(SQL_TYPE_ALIASES) +# Types that require special handling (stored in attribute comment for reconstruction) +SPECIAL_TYPES = {"ADAPTED"} | set(CORE_TYPE_ALIASES) + +# Native SQL types that pass through without modification NATIVE_TYPES = set(TYPE_PATTERN) - SPECIAL_TYPES -EXTERNAL_TYPES = { - "EXTERNAL_ATTACH", - "EXTERNAL_BLOB", - "FILEPATH", -} # data referenced by a UUID in external tables -# Blob and attachment types cannot have SQL default values (other than NULL) -BINARY_TYPES = { - "EXTERNAL_ATTACH", - "INTERNAL_ATTACH", - "EXTERNAL_BLOB", - "INTERNAL_BLOB", -} -assert set().union(SPECIAL_TYPES, EXTERNAL_TYPES, BINARY_TYPES) <= set(TYPE_PATTERN) +assert SPECIAL_TYPES <= set(TYPE_PATTERN) def match_type(attribute_type): @@ -459,50 +441,32 @@ def format_attribute(attr): def substitute_special_type(match, category, foreign_key_sql, context): """ + Substitute special types with their native SQL equivalents. + + Special types are: + - Core type aliases (float32 → float, uuid → binary(16), etc.) + - ADAPTED types (AttributeTypes in angle brackets) + :param match: dict containing with keys "type" and "comment" -- will be modified in place :param category: attribute type category from TYPE_PATTERN :param foreign_key_sql: list of foreign key declarations to add to :param context: context for looking up user-defined attribute_type adapters """ - if category == "UUID": - match["type"] = UUID_DATA_TYPE - elif category == "INTERNAL_ATTACH": - match["type"] = "LONGBLOB" - elif category == "OBJECT": - # Object type stores metadata as JSON - no foreign key to external table - # Extract store name if present (object@store_name syntax) - if "@" in match["type"]: - match["store"] = match["type"].split("@", 1)[1] - match["type"] = "JSON" - elif category in EXTERNAL_TYPES: - if category == "FILEPATH" and not _support_filepath_types(): - raise DataJointError( - """ - The filepath data type is disabled until complete validation. - To turn it on as experimental feature, set the environment variable - {env} = TRUE or upgrade datajoint. - """.format(env=FILEPATH_FEATURE_SWITCH) - ) - match["store"] = match["type"].split("@", 1)[1] - match["type"] = UUID_DATA_TYPE - foreign_key_sql.append( - "FOREIGN KEY (`{name}`) REFERENCES `{{database}}`.`{external_table_root}_{store}` (`hash`) " - "ON UPDATE RESTRICT ON DELETE RESTRICT".format(external_table_root=EXTERNAL_TABLE_ROOT, **match) - ) - elif category == "ADAPTED": + if category == "ADAPTED": + # AttributeType - resolve to underlying dtype attr_type, store_name = get_adapter(context, match["type"]) - # Store the store parameter if present if store_name is not None: match["store"] = store_name match["type"] = attr_type.dtype + # Recursively resolve if dtype is also a special type category = match_type(match["type"]) if category in SPECIAL_TYPES: - # recursive redefinition from user-defined datatypes. 
substitute_special_type(match, category, foreign_key_sql, context) - elif category in SQL_TYPE_ALIASES: - match["type"] = SQL_TYPE_ALIASES[category] + elif category in CORE_TYPE_ALIASES: + # Core type alias - substitute with native SQL type + match["type"] = CORE_TYPE_ALIASES[category] else: - assert False, "Unknown special type" + assert False, f"Unknown special type: {category}" def compile_attribute(line, in_key, foreign_key_sql, context): @@ -513,7 +477,7 @@ def compile_attribute(line, in_key, foreign_key_sql, context): :param in_key: set to True if attribute is in primary key set :param foreign_key_sql: the list of foreign key declarations to add to :param context: context in which to look up user-defined attribute type adapterss - :returns: (name, sql, is_external) -- attribute name and sql code for its declaration + :returns: (name, sql, store) -- attribute name, sql code for its declaration, and optional store name """ try: match = attribute_parser.parseString(line + "#", parseAll=True) @@ -550,13 +514,10 @@ def compile_attribute(line, in_key, foreign_key_sql, context): match["comment"] = ":{type}:{comment}".format(**match) # insert custom type into comment substitute_special_type(match, category, foreign_key_sql, context) - if category in BINARY_TYPES and match["default"] not in { - "DEFAULT NULL", - "NOT NULL", - }: - raise DataJointError( - "The default value for blob or attachment attributes can only be NULL in:\n{line}".format(line=line) - ) + # Check for invalid default values on blob types (after type substitution) + final_category = match_type(match["type"]) + if final_category == "BLOB" and match["default"] not in {"DEFAULT NULL", "NOT NULL"}: + raise DataJointError("The default value for blob attributes can only be NULL in:\n{line}".format(line=line)) sql = ("`{name}` {type} {default}" + (' COMMENT "{comment}"' if match["comment"] else "")).format(**match) return match["name"], sql, match.get("store") diff --git a/src/datajoint/fetch.py b/src/datajoint/fetch.py index e1b655fc0..000ab0bfd 100644 --- a/src/datajoint/fetch.py +++ b/src/datajoint/fetch.py @@ -1,21 +1,15 @@ -import itertools import json import numbers -import uuid +import uuid as uuid_module from functools import partial -from pathlib import Path import numpy as np import pandas from datajoint.condition import Top -from . import hash from .errors import DataJointError -from .objectref import ObjectRef from .settings import config -from .storage import StorageBackend -from .utils import safe_write class key: @@ -39,79 +33,51 @@ def to_dicts(recarray): def _get(connection, attr, data, squeeze, download_path): """ - This function is called for every attribute + Retrieve and decode attribute data from the database. + + In the simplified type system: + - Native types pass through unchanged + - JSON types are parsed + - UUID types are converted from bytes + - Blob types return raw bytes (unless an adapter handles them) + - Adapters (AttributeTypes) handle all custom encoding/decoding :param connection: a dj.Connection object - :param attr: attribute name from the table's heading - :param data: literal value fetched from the table - :param squeeze: if True squeeze blobs - :param download_path: for fetches that download data, e.g. 
attachments - :return: unpacked data + :param attr: attribute from the table's heading + :param data: raw value fetched from the database + :param squeeze: if True squeeze blobs (legacy, unused) + :param download_path: for fetches that download data (legacy, unused in simplified model) + :return: decoded data """ if data is None: - return - if attr.is_object: - # Object type - return ObjectRef handle - json_data = json.loads(data) if isinstance(data, str) else data - # Get the correct backend based on store name in metadata - store_name = json_data.get("store") # None for default store - try: - spec = config.get_object_store_spec(store_name) - backend = StorageBackend(spec) - except DataJointError: - backend = None - return ObjectRef.from_json(json_data, backend=backend) + return None + + # JSON type - parse and optionally decode via adapter if attr.json: - return json.loads(data) - - extern = connection.schemas[attr.database].external[attr.store] if attr.is_external else None - - # apply custom attribute type decoder if present - def adapt(x): - return attr.adapter.decode(x, key=None) if attr.adapter else x - - if attr.is_filepath: - return adapt(extern.download_filepath(uuid.UUID(bytes=data))[0]) - if attr.is_attachment: - # Steps: - # 1. get the attachment filename - # 2. check if the file already exists at download_path, verify checksum - # 3. if exists and checksum passes then return the local filepath - # 4. Otherwise, download the remote file and return the new filepath - _uuid = uuid.UUID(bytes=data) if attr.is_external else None - attachment_name = extern.get_attachment_name(_uuid) if attr.is_external else data.split(b"\0", 1)[0].decode() - local_filepath = Path(download_path) / attachment_name - if local_filepath.is_file(): - attachment_checksum = _uuid if attr.is_external else hash.uuid_from_buffer(data) - if attachment_checksum == hash.uuid_from_file(local_filepath, init_string=attachment_name + "\0"): - return adapt(str(local_filepath)) # checksum passed, no need to download again - # generate the next available alias filename - for n in itertools.count(): - f = local_filepath.parent / (local_filepath.stem + "_%04x" % n + local_filepath.suffix) - if not f.is_file(): - local_filepath = f - break - if attachment_checksum == hash.uuid_from_file(f, init_string=attachment_name + "\0"): - return adapt(str(f)) # checksum passed, no need to download again - # Save attachment - if attr.is_external: - extern.download_attachment(_uuid, attachment_name, local_filepath) - else: - # write from buffer - safe_write(local_filepath, data.split(b"\0", 1)[1]) - return adapt(str(local_filepath)) # download file from remote store + parsed = json.loads(data) + if attr.adapter: + return attr.adapter.decode(parsed, key=None) + return parsed + # UUID type - convert bytes to UUID object if attr.uuid: - return adapt(uuid.UUID(bytes=data)) - elif attr.is_blob: - blob_data = extern.get(uuid.UUID(bytes=data)) if attr.is_external else data - # Adapters (like ) handle deserialization in decode() - # Without adapter, blob columns return raw bytes (no deserialization) + result = uuid_module.UUID(bytes=data) if attr.adapter: - return attr.adapter.decode(blob_data, key=None) - return blob_data # raw bytes - else: - return adapt(data) + return attr.adapter.decode(result, key=None) + return result + + # Blob type - return raw bytes or decode via adapter + if attr.is_blob: + if attr.adapter: + return attr.adapter.decode(data, key=None) + return data # raw bytes + + # Other types with adapter + if attr.adapter: + 
return attr.adapter.decode(data, key=None) + + # Native types - pass through unchanged + return data class Fetch: diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index cc8034cd7..07617004e 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -8,13 +8,11 @@ from .attribute_adapter import get_adapter from .attribute_type import AttributeType from .declare import ( - EXTERNAL_TYPES, - NATIVE_TYPES, + CORE_TYPE_ALIASES, SPECIAL_TYPES, TYPE_PATTERN, - UUID_DATA_TYPE, ) -from .errors import FILEPATH_FEATURE_SWITCH, DataJointError, _support_filepath_types +from .errors import DataJointError class _MissingType(AttributeType): @@ -62,10 +60,6 @@ def decode(self, stored, *, key=None): uuid=False, json=None, is_blob=False, - is_attachment=False, - is_filepath=False, - is_object=False, - is_external=False, is_hidden=False, adapter=None, store=None, @@ -88,11 +82,13 @@ def todict(self): @property def sql_type(self): """:return: datatype (as string) in database. In most cases, it is the same as self.type""" - return UUID_DATA_TYPE if self.uuid else self.type + # UUID is now a core type alias - already resolved to binary(16) + return self.type @property def sql_comment(self): """:return: full comment for the SQL declaration. Includes custom type specification""" + # UUID info is stored in the comment for reconstruction return (":uuid:" if self.uuid else "") + self.comment @property @@ -167,17 +163,10 @@ def secondary_attributes(self): def blobs(self): return [k for k, v in self.attributes.items() if v.is_blob] - @property - def objects(self): - return [k for k, v in self.attributes.items() if v.is_object] - @property def non_blobs(self): - return [ - k - for k, v in self.attributes.items() - if not (v.is_blob or v.is_attachment or v.is_filepath or v.is_object or v.json) - ] + """Attributes that are not blobs or JSON (used for simple column handling).""" + return [k for k, v in self.attributes.items() if not (v.is_blob or v.json)] @property def new_attributes(self): @@ -298,15 +287,11 @@ def _init_from_database(self): autoincrement=bool(re.search(r"auto_increment", attr["Extra"], flags=re.I)), numeric=any(TYPE_PATTERN[t].match(attr["type"]) for t in ("DECIMAL", "INTEGER", "FLOAT")), string=any(TYPE_PATTERN[t].match(attr["type"]) for t in ("ENUM", "TEMPORAL", "STRING")), - is_blob=bool(TYPE_PATTERN["INTERNAL_BLOB"].match(attr["type"])), + is_blob=bool(TYPE_PATTERN["BLOB"].match(attr["type"])), uuid=False, json=bool(TYPE_PATTERN["JSON"].match(attr["type"])), - is_attachment=False, - is_filepath=False, - is_object=False, adapter=None, store=None, - is_external=False, attribute_expression=None, is_hidden=attr["name"].startswith("_"), ) @@ -316,26 +301,34 @@ def _init_from_database(self): attr["unsupported"] = not any((attr["is_blob"], attr["numeric"], attr["numeric"])) attr.pop("Extra") - # process custom DataJoint types + # process custom DataJoint types stored in comment special = re.match(r":(?P[^:]+):(?P.*)", attr["comment"]) if special: special = special.groupdict() attr.update(special) - # process custom attribute types (adapted types) + + # process AttributeTypes (adapted types in angle brackets) if special and TYPE_PATTERN["ADAPTED"].match(attr["type"]): assert context is not None, "Declaration context is not set" adapter_name = special["type"] try: - attr.update(adapter=get_adapter(context, adapter_name)) + adapter_result = get_adapter(context, adapter_name) + # get_adapter returns (adapter, store_name) tuple + if isinstance(adapter_result, tuple): + 
attr["adapter"], attr["store"] = adapter_result + else: + attr["adapter"] = adapter_result except DataJointError: # if no adapter, then delay the error until the first invocation - attr.update(adapter=_MissingType(adapter_name)) + attr["adapter"] = _MissingType(adapter_name) else: - attr.update(type=attr["adapter"].dtype) + attr["type"] = attr["adapter"].dtype if not any(r.match(attr["type"]) for r in TYPE_PATTERN.values()): raise DataJointError(f"Invalid dtype '{attr['type']}' in attribute type <{adapter_name}>.") - special = not any(TYPE_PATTERN[c].match(attr["type"]) for c in NATIVE_TYPES) + # Update is_blob based on resolved dtype + attr["is_blob"] = bool(TYPE_PATTERN["BLOB"].match(attr["type"])) + # Handle core type aliases (uuid, float32, etc.) if special: try: category = next(c for c in SPECIAL_TYPES if TYPE_PATTERN[c].match(attr["type"])) @@ -350,46 +343,18 @@ def _init_from_database(self): url=url, **attr ) ) - raise DataJointError("Unknown attribute type `{type}`".format(**attr)) - if category == "FILEPATH" and not _support_filepath_types(): - raise DataJointError( - """ - The filepath data type is disabled until complete validation. - To turn it on as experimental feature, set the environment variable - {env} = TRUE or upgrade datajoint. - """.format(env=FILEPATH_FEATURE_SWITCH) - ) - # Extract store name for external types and object types with named stores - store = None - if category in EXTERNAL_TYPES: - store = attr["type"].split("@")[1] - elif category == "OBJECT" and "@" in attr["type"]: - store = attr["type"].split("@")[1] - - attr.update( - unsupported=False, - is_attachment=category in ("INTERNAL_ATTACH", "EXTERNAL_ATTACH"), - is_filepath=category == "FILEPATH", - is_object=category == "OBJECT", - # INTERNAL_BLOB is not a custom type but is included for completeness - is_blob=category in ("INTERNAL_BLOB", "EXTERNAL_BLOB"), - uuid=category == "UUID", - is_external=category in EXTERNAL_TYPES, - store=store, - ) + # Not a special type - that's fine, could be native passthrough + category = None - if attr["in_key"] and any( - ( - attr["is_blob"], - attr["is_attachment"], - attr["is_filepath"], - attr["is_object"], - attr["json"], - ) - ): - raise DataJointError( - "Json, Blob, attachment, filepath, or object attributes " "are not allowed in the primary key" - ) + if category == "UUID": + attr["uuid"] = True + elif category in CORE_TYPE_ALIASES: + # Core type alias - already resolved in DB + pass + + # Check primary key constraints + if attr["in_key"] and (attr["is_blob"] or attr["json"]): + raise DataJointError("Blob or JSON attributes are not allowed in the primary key") if attr["string"] and attr["default"] is not None and attr["default"] not in sql_literals: attr["default"] = '"%s"' % attr["default"] @@ -410,7 +375,7 @@ def _init_from_database(self): attr["dtype"] = numeric_types[(t, is_unsigned)] if attr["adapter"]: - # restore adapted type name + # restore adapted type name for display attr["type"] = adapter_name self._attributes = dict(((q["name"], Attribute(**q)) for q in attributes)) diff --git a/src/datajoint/preview.py b/src/datajoint/preview.py index 5c61db1da..7572125e9 100644 --- a/src/datajoint/preview.py +++ b/src/datajoint/preview.py @@ -27,7 +27,8 @@ def _format_object_display(json_data): def preview(query_expression, limit, width): heading = query_expression.heading rel = query_expression.proj(*heading.non_blobs) - object_fields = heading.objects + # Object fields are AttributeTypes with adapters - not specially handled in simplified model + object_fields 
= [] if limit is None: limit = config["display.limit"] if width is None: @@ -87,7 +88,8 @@ def get_display_value(tup, f, idx): def repr_html(query_expression): heading = query_expression.heading rel = query_expression.proj(*heading.non_blobs) - object_fields = heading.objects + # Object fields are AttributeTypes with adapters - not specially handled in simplified model + object_fields = [] info = heading.table_status tuples = rel.fetch(limit=config["display.limit"] + 1, format="array") has_more = len(tuples) > config["display.limit"] diff --git a/src/datajoint/staged_insert.py b/src/datajoint/staged_insert.py index 9083bb78b..3a3d5bd17 100644 --- a/src/datajoint/staged_insert.py +++ b/src/datajoint/staged_insert.py @@ -98,8 +98,9 @@ def _get_storage_path(self, field: str, ext: str = "") -> str: raise DataJointError(f"Attribute '{field}' not found in table heading") attr = self._table.heading[field] - if not attr.is_object: - raise DataJointError(f"Attribute '{field}' is not an object type") + # Check if this is an object AttributeType (has adapter with "object" in type_name) + if not (attr.adapter and hasattr(attr.adapter, "type_name") and "object" in attr.adapter.type_name): + raise DataJointError(f"Attribute '{field}' is not an type") # Extract primary key from rec primary_key = {k: self._rec[k] for k in self._table.primary_key if k in self._rec} diff --git a/src/datajoint/table.py b/src/datajoint/table.py index 02374b9ff..170e06089 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -924,56 +924,49 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False, row=None): as a string to be included in the query and the value, if any, to be submitted for processing by mysql API. + In the simplified type system: + - Adapters (AttributeTypes) handle all custom encoding + - UUID values are converted to bytes + - JSON values are serialized + - Blob values pass through as bytes + - Numeric values are stringified + :param name: name of attribute to be inserted :param value: value of attribute to be inserted :param ignore_extra_fields: if True, return None for unknown fields - :param row: the full row dict (needed for object attributes to extract primary key) + :param row: the full row dict (unused in simplified model) """ if ignore_extra_fields and name not in self.heading: return None attr = self.heading[name] + + # Apply adapter encoding first (if present) if attr.adapter: - # Custom attribute type: validate and encode attr.adapter.validate(value) value = attr.adapter.encode(value, key=None) + + # Handle NULL values if value is None or (attr.numeric and (value == "" or np.isnan(float(value)))): - # set default value placeholder, value = "DEFAULT", None - else: # not NULL + else: placeholder = "%s" + # UUID - convert to bytes if attr.uuid: if not isinstance(value, uuid.UUID): try: value = uuid.UUID(value) except (AttributeError, ValueError): - raise DataJointError("badly formed UUID value {v} for attribute `{n}`".format(v=value, n=name)) + raise DataJointError(f"badly formed UUID value {value} for attribute `{name}`") value = value.bytes - elif attr.is_blob: - # Adapters (like ) handle serialization in encode() - # Without adapter, blob columns store raw bytes (no serialization) - if attr.is_external: - value = self.external[attr.store].put(value).bytes - elif attr.is_attachment: - attachment_path = Path(value) - if attr.is_external: - # value is hash of contents - value = self.external[attr.store].upload_attachment(attachment_path).bytes - else: - # value is filename 
+ contents - value = str.encode(attachment_path.name) + b"\0" + attachment_path.read_bytes() - elif attr.is_filepath: - value = self.external[attr.store].upload_filepath(value).bytes - elif attr.is_object: - # Object type - upload to object storage and return JSON metadata - if row is None: - raise DataJointError( - f"Object attribute {name} requires full row context for insert. " "This is an internal error." - ) - value = self._process_object_value(name, value, row, store_name=attr.store) - elif attr.numeric: - value = str(int(value) if isinstance(value, bool) else value) + # JSON - serialize to string elif attr.json: value = json.dumps(value) + # Numeric - convert to string + elif attr.numeric: + value = str(int(value) if isinstance(value, bool) else value) + # Blob - pass through as bytes (adapters handle serialization) + # elif attr.is_blob: pass through unchanged + return name, placeholder, value def __make_row_to_insert(self, row, field_list, ignore_extra_fields): From f35e027525535f004aac2a5c6f3bc7340302a5b6 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 20:51:16 +0000 Subject: [PATCH 28/41] =?UTF-8?q?Define=20complete=20core=20type=20system?= =?UTF-8?q?=20with=20blob=E2=86=92longblob=20mapping?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Core DataJoint types (fully supported, recorded in :type: comments): - Numeric: float32, float64, int64, uint64, int32, uint32, int16, uint16, int8, uint8 - Boolean: bool - UUID: uuid → binary(16) - JSON: json - Binary: blob → longblob - Temporal: date, datetime - String: char(n), varchar(n) - Enumeration: enum(...) Changes: - declare.py: Define CORE_TYPES with (pattern, sql_mapping) pairs - declare.py: Add warning for non-standard native type usage - heading.py: Update to use CORE_TYPE_NAMES - storage-types-spec.md: Update documentation to reflect core types Native database types (text, mediumint, etc.) pass through with a warning about non-standard usage. Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 87 +++++++------- src/datajoint/declare.py | 117 +++++++++++-------- src/datajoint/heading.py | 4 +- 3 files changed, 118 insertions(+), 90 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index a962ee6c8..668fdfdf5 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -12,19 +12,20 @@ This document defines a three-layer type architecture: ┌───────────────────────────────────────────────────────────────────┐ │ AttributeTypes (Layer 3) │ │ │ -│ Built-in: │ +│ Built-in: │ │ User: ... │ ├───────────────────────────────────────────────────────────────────┤ │ Core DataJoint Types (Layer 2) │ │ │ -│ int8 int16 int32 int64 float32 float64 bool decimal │ -│ uint8 uint16 uint32 uint64 varchar char uuid date │ -│ json longblob blob timestamp datetime enum │ +│ float32 float64 int64 uint64 int32 uint32 int16 uint16 │ +│ int8 uint8 bool uuid json blob date datetime │ +│ char(n) varchar(n) enum(...) │ ├───────────────────────────────────────────────────────────────────┤ │ Native Database Types (Layer 1) │ │ │ │ MySQL: TINYINT SMALLINT INT BIGINT FLOAT DOUBLE ... 
│ │ PostgreSQL: SMALLINT INTEGER BIGINT REAL DOUBLE PRECISION │ +│ (pass through with warning for non-standard types) │ └───────────────────────────────────────────────────────────────────┘ ``` @@ -49,61 +50,65 @@ For arbitrary URLs that don't need ObjectRef semantics, use `varchar` instead. Core types provide a standardized, scientist-friendly interface that works identically across MySQL and PostgreSQL backends. Users should prefer these over native database types. +**All core types are recorded in field comments using `:type:` syntax for reconstruction.** + ### Numeric Types -| Core Type | Description | MySQL | PostgreSQL | -|-----------|-------------|-------|------------| -| `int8` | 8-bit signed | `TINYINT` | `SMALLINT` (clamped) | -| `int16` | 16-bit signed | `SMALLINT` | `SMALLINT` | -| `int32` | 32-bit signed | `INT` | `INTEGER` | -| `int64` | 64-bit signed | `BIGINT` | `BIGINT` | -| `uint8` | 8-bit unsigned | `TINYINT UNSIGNED` | `SMALLINT` (checked) | -| `uint16` | 16-bit unsigned | `SMALLINT UNSIGNED` | `INTEGER` (checked) | -| `uint32` | 32-bit unsigned | `INT UNSIGNED` | `BIGINT` (checked) | -| `uint64` | 64-bit unsigned | `BIGINT UNSIGNED` | `NUMERIC(20)` | -| `float32` | 32-bit float | `FLOAT` | `REAL` | -| `float64` | 64-bit float | `DOUBLE` | `DOUBLE PRECISION` | -| `decimal(p,s)` | Fixed precision | `DECIMAL(p,s)` | `NUMERIC(p,s)` | +| Core Type | Description | MySQL | +|-----------|-------------|-------| +| `int8` | 8-bit signed | `TINYINT` | +| `int16` | 16-bit signed | `SMALLINT` | +| `int32` | 32-bit signed | `INT` | +| `int64` | 64-bit signed | `BIGINT` | +| `uint8` | 8-bit unsigned | `TINYINT UNSIGNED` | +| `uint16` | 16-bit unsigned | `SMALLINT UNSIGNED` | +| `uint32` | 32-bit unsigned | `INT UNSIGNED` | +| `uint64` | 64-bit unsigned | `BIGINT UNSIGNED` | +| `float32` | 32-bit float | `FLOAT` | +| `float64` | 64-bit float | `DOUBLE` | ### String Types -| Core Type | Description | MySQL | PostgreSQL | -|-----------|-------------|-------|------------| -| `char(n)` | Fixed-length | `CHAR(n)` | `CHAR(n)` | -| `varchar(n)` | Variable-length | `VARCHAR(n)` | `VARCHAR(n)` | +| Core Type | Description | MySQL | +|-----------|-------------|-------| +| `char(n)` | Fixed-length | `CHAR(n)` | +| `varchar(n)` | Variable-length | `VARCHAR(n)` | ### Boolean -| Core Type | Description | MySQL | PostgreSQL | -|-----------|-------------|-------|------------| -| `bool` | True/False | `TINYINT(1)` | `BOOLEAN` | +| Core Type | Description | MySQL | +|-----------|-------------|-------| +| `bool` | True/False | `TINYINT` | ### Date/Time Types -| Core Type | Description | MySQL | PostgreSQL | -|-----------|-------------|-------|------------| -| `date` | Date only | `DATE` | `DATE` | -| `datetime` | Date and time | `DATETIME(6)` | `TIMESTAMP` | -| `timestamp` | Auto-updating | `TIMESTAMP` | `TIMESTAMP` | -| `time` | Time only | `TIME` | `TIME` | +| Core Type | Description | MySQL | +|-----------|-------------|-------| +| `date` | Date only | `DATE` | +| `datetime` | Date and time | `DATETIME` | ### Binary Types -Core binary types store raw bytes without any serialization. Use `` AttributeType +The core `blob` type stores raw bytes without any serialization. Use `` AttributeType for serialized Python objects. 
-| Core Type | Description | MySQL | PostgreSQL | -|-----------|-------------|-------|------------| -| `blob` | Raw bytes up to 64KB | `BLOB` | `BYTEA` | -| `longblob` | Raw bytes up to 4GB | `LONGBLOB` | `BYTEA` | +| Core Type | Description | MySQL | +|-----------|-------------|-------| +| `blob` | Raw bytes | `LONGBLOB` | + +### Other Types + +| Core Type | Description | MySQL | +|-----------|-------------|-------| +| `json` | JSON document | `JSON` | +| `uuid` | UUID | `BINARY(16)` | +| `enum(...)` | Enumeration | `ENUM(...)` | -### Special Types +### Native Passthrough Types -| Core Type | Description | MySQL | PostgreSQL | -|-----------|-------------|-------|------------| -| `json` | JSON document | `JSON` | `JSONB` | -| `uuid` | UUID | `CHAR(36)` | `UUID` | -| `enum(...)` | Enumeration | `ENUM(...)` | `VARCHAR` + CHECK | +Users may use native database types directly (e.g., `text`, `mediumint auto_increment`), +but these will generate a warning about non-standard usage. Native types are not recorded +in field comments and may have portability issues across database backends. ## AttributeTypes (Layer 3) diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index a333d5f87..c08a5fd4c 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -14,25 +14,44 @@ from .errors import DataJointError from .settings import config -# Core DataJoint type aliases - scientist-friendly names mapped to native SQL types -# These types can be used without angle brackets in table definitions -CORE_TYPE_ALIASES = { - # Numeric types - "FLOAT32": "float", - "FLOAT64": "double", - "INT64": "bigint", - "UINT64": "bigint unsigned", - "INT32": "int", - "UINT32": "int unsigned", - "INT16": "smallint", - "UINT16": "smallint unsigned", - "INT8": "tinyint", - "UINT8": "tinyint unsigned", - "BOOL": "tinyint", - # UUID type - "UUID": "binary(16)", +# Core DataJoint types - scientist-friendly names that are fully supported +# These are recorded in field comments using :type: syntax for reconstruction +# Format: pattern_name -> (regex_pattern, mysql_type or None if same as matched) +CORE_TYPES = { + # Numeric types (aliased to native SQL) + "float32": (r"float32$", "float"), + "float64": (r"float64$", "double"), + "int64": (r"int64$", "bigint"), + "uint64": (r"uint64$", "bigint unsigned"), + "int32": (r"int32$", "int"), + "uint32": (r"uint32$", "int unsigned"), + "int16": (r"int16$", "smallint"), + "uint16": (r"uint16$", "smallint unsigned"), + "int8": (r"int8$", "tinyint"), + "uint8": (r"uint8$", "tinyint unsigned"), + "bool": (r"bool$", "tinyint"), + # UUID (stored as binary) + "uuid": (r"uuid$", "binary(16)"), + # JSON + "json": (r"json$", None), # json passes through as-is + # Binary (blob maps to longblob) + "blob": (r"blob$", "longblob"), + # Temporal + "date": (r"date$", None), + "datetime": (r"datetime$", None), + # String types (with parameters) + "char": (r"char\s*\(\d+\)$", None), + "varchar": (r"varchar\s*\(\d+\)$", None), + # Enumeration + "enum": (r"enum\s*\(.+\)$", None), } +# Compile core type patterns +CORE_TYPE_PATTERNS = {name: re.compile(pattern, re.I) for name, (pattern, _) in CORE_TYPES.items()} + +# Get SQL mapping for core types +CORE_TYPE_SQL = {name: sql_type for name, (_, sql_type) in CORE_TYPES.items()} + MAX_TABLE_NAME_LENGTH = 64 CONSTANT_LITERALS = { "CURRENT_TIMESTAMP", @@ -40,47 +59,38 @@ } # SQL literals to be used without quotes (case insensitive) # Type patterns for declaration parsing -# Two categories: core type aliases and native passthrough types 
TYPE_PATTERN = { k: re.compile(v, re.I) for k, v in dict( - # Core DataJoint type aliases (scientist-friendly names) - FLOAT32=r"float32$", - FLOAT64=r"float64$", - INT64=r"int64$", - UINT64=r"uint64$", - INT32=r"int32$", - UINT32=r"uint32$", - INT16=r"int16$", - UINT16=r"uint16$", - INT8=r"int8$", - UINT8=r"uint8$", - BOOL=r"bool$", - UUID=r"uuid$", - # Native SQL types (passthrough) + # Core DataJoint types + **{name.upper(): pattern for name, (pattern, _) in CORE_TYPES.items()}, + # Native SQL types (passthrough with warning for non-standard use) INTEGER=r"((tiny|small|medium|big|)int|integer)(\s*\(.+\))?(\s+unsigned)?(\s+auto_increment)?|serial$", DECIMAL=r"(decimal|numeric)(\s*\(.+\))?(\s+unsigned)?$", FLOAT=r"(double|float|real)(\s*\(.+\))?(\s+unsigned)?$", - STRING=r"(var)?char\s*\(.+\)$", - JSON=r"json$", - ENUM=r"enum\s*\(.+\)$", - TEMPORAL=r"(date|datetime|time|timestamp|year)(\s*\(.+\))?$", - BLOB=r"(tiny|small|medium|long|)blob$", + STRING=r"(var)?char\s*\(.+\)$", # Catches char/varchar not matched by core types + TEMPORAL=r"(time|timestamp|year)(\s*\(.+\))?$", # time, timestamp, year (not date/datetime) + NATIVE_BLOB=r"(tiny|small|medium|long)blob$", # Specific blob variants + TEXT=r"(tiny|small|medium|long)?text$", # Text types # AttributeTypes use angle brackets ADAPTED=r"<.+>$", ).items() } -# Types that require special handling (stored in attribute comment for reconstruction) -SPECIAL_TYPES = {"ADAPTED"} | set(CORE_TYPE_ALIASES) +# Core types are stored in attribute comment for reconstruction +CORE_TYPE_NAMES = {name.upper() for name in CORE_TYPES} + +# Special types that need comment storage (core types + adapted) +SPECIAL_TYPES = CORE_TYPE_NAMES | {"ADAPTED"} -# Native SQL types that pass through without modification +# Native SQL types that pass through (with optional warning) NATIVE_TYPES = set(TYPE_PATTERN) - SPECIAL_TYPES assert SPECIAL_TYPES <= set(TYPE_PATTERN) def match_type(attribute_type): + """Match an attribute type string to a category.""" try: return next(category for category, pattern in TYPE_PATTERN.items() if pattern.match(attribute_type)) except StopIteration: @@ -444,7 +454,7 @@ def substitute_special_type(match, category, foreign_key_sql, context): Substitute special types with their native SQL equivalents. Special types are: - - Core type aliases (float32 → float, uuid → binary(16), etc.) + - Core DataJoint types (float32 → float, uuid → binary(16), blob → longblob, etc.) 
- ADAPTED types (AttributeTypes in angle brackets) :param match: dict containing with keys "type" and "comment" -- will be modified in place @@ -462,9 +472,13 @@ def substitute_special_type(match, category, foreign_key_sql, context): category = match_type(match["type"]) if category in SPECIAL_TYPES: substitute_special_type(match, category, foreign_key_sql, context) - elif category in CORE_TYPE_ALIASES: - # Core type alias - substitute with native SQL type - match["type"] = CORE_TYPE_ALIASES[category] + elif category in CORE_TYPE_NAMES: + # Core DataJoint type - substitute with native SQL type if mapping exists + core_name = category.lower() + sql_type = CORE_TYPE_SQL.get(core_name) + if sql_type is not None: + match["type"] = sql_type + # else: type passes through as-is (json, date, datetime, char, varchar, enum) else: assert False, f"Unknown special type: {category}" @@ -510,13 +524,22 @@ def compile_attribute(line, in_key, foreign_key_sql, context): raise DataJointError('An attribute comment must not start with a colon in comment "{comment}"'.format(**match)) category = match_type(match["type"]) + if category in SPECIAL_TYPES: - match["comment"] = ":{type}:{comment}".format(**match) # insert custom type into comment + # Core types and AttributeTypes are recorded in comment for reconstruction + match["comment"] = ":{type}:{comment}".format(**match) substitute_special_type(match, category, foreign_key_sql, context) + elif category in NATIVE_TYPES: + # Non-standard native type - warn user + logger.warning( + f"Non-standard native type '{match['type']}' in attribute '{match['name']}'. " + "Consider using a core DataJoint type for better portability." + ) # Check for invalid default values on blob types (after type substitution) - final_category = match_type(match["type"]) - if final_category == "BLOB" and match["default"] not in {"DEFAULT NULL", "NOT NULL"}: + # Note: blob → longblob, so check for NATIVE_BLOB or longblob result + final_type = match["type"].lower() + if ("blob" in final_type) and match["default"] not in {"DEFAULT NULL", "NOT NULL"}: raise DataJointError("The default value for blob attributes can only be NULL in:\n{line}".format(line=line)) sql = ("`{name}` {type} {default}" + (' COMMENT "{comment}"' if match["comment"] else "")).format(**match) diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index 07617004e..9750b84f3 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -8,7 +8,7 @@ from .attribute_adapter import get_adapter from .attribute_type import AttributeType from .declare import ( - CORE_TYPE_ALIASES, + CORE_TYPE_NAMES, SPECIAL_TYPES, TYPE_PATTERN, ) @@ -348,7 +348,7 @@ def _init_from_database(self): if category == "UUID": attr["uuid"] = True - elif category in CORE_TYPE_ALIASES: + elif category in CORE_TYPE_NAMES: # Core type alias - already resolved in DB pass From 746108a63072650b9fdd8b6df13e74acff12066b Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 20:55:51 +0000 Subject: [PATCH 29/41] Implement Phase 2: Content-Addressed Storage Add content-addressed storage with deduplication for the and AttributeTypes. 
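For orientation, here is a rough usage sketch of the content-addressed helpers this commit adds (the new `<content>` and `<xblob>` types build on them; function names and metadata keys follow content_registry.py, while the store name "cold" is a placeholder for whatever object store is configured):

```python
from datajoint.content_registry import content_exists, get_content, put_content

payload = b"large binary payload"

# First call uploads to _content/{hash[:2]}/{hash[2:4]}/{hash}; identical bytes are not re-uploaded.
meta = put_content(payload, store_name="cold")
assert meta["size"] == len(payload)

# Deduplication: storing the same bytes again yields the same SHA256 hash and no new upload.
assert put_content(payload, store_name="cold")["hash"] == meta["hash"]

# Retrieval verifies the hash before returning the original bytes.
assert get_content(meta["hash"], store_name="cold") == payload
assert content_exists(meta["hash"], store_name="cold")
```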
New files: - content_registry.py: Content storage utilities - compute_content_hash(): SHA256 hashing - build_content_path(): Hierarchical path generation (_content/xx/yy/hash) - put_content(): Store with deduplication - get_content(): Retrieve with hash verification - content_exists(), delete_content(), get_content_size() New built-in AttributeTypes in attribute_type.py: - ContentType (): Content-addressed storage for raw bytes - dtype = "json" (stores metadata: hash, store, size) - Automatic deduplication via SHA256 hashing - XBlobType (): Serialized blobs with external storage - dtype = "" (composition with ContentType) - Combines djblob serialization with content-addressed storage Updated insert/fetch for type chain support: - table.py: Apply encoder chain from outermost to innermost - fetch.py: Apply decoder chain from innermost to outermost - Both pass store_name through the chain for external storage Example usage: data : # Raw bytes, deduplicated array : # Serialized objects, deduplicated Co-authored-by: dimitri-yatsenko --- src/datajoint/attribute_type.py | 169 ++++++++++++++++++++++++++ src/datajoint/content_registry.py | 193 ++++++++++++++++++++++++++++++ src/datajoint/fetch.py | 45 ++++--- src/datajoint/table.py | 19 ++- 4 files changed, 405 insertions(+), 21 deletions(-) create mode 100644 src/datajoint/content_registry.py diff --git a/src/datajoint/attribute_type.py b/src/datajoint/attribute_type.py index 97ca54646..2c06ccc83 100644 --- a/src/datajoint/attribute_type.py +++ b/src/datajoint/attribute_type.py @@ -570,6 +570,173 @@ def decode(self, stored: bytes, *, key: dict | None = None) -> Any: return blob.unpack(stored, squeeze=False) +class ContentType(AttributeType): + """ + Built-in type for content-addressed storage with deduplication. + + The ```` type stores data using content-addressed storage. Data is + identified by its SHA256 hash and stored in a hierarchical directory structure. + Duplicate content is automatically deduplicated - storing the same bytes twice + will only create one copy in storage. + + The database column stores JSON metadata including the content hash, store name, + and size. The actual content is stored in external storage. + + This type is primarily used as a building block for other types like ```` + and ````, but can also be used directly for raw binary content. + + Example: + @schema + class RawContent(dj.Manual): + definition = ''' + content_id : int + --- + data : # Content-addressed storage + ''' + + # Insert raw bytes + table.insert1({'content_id': 1, 'data': b'raw binary content'}) + + # Fetch returns the original bytes + data = (table & 'content_id=1').fetch1('data') + assert data == b'raw binary content' + + Storage Structure: + Content is stored at: ``_content/{hash[:2]}/{hash[2:4]}/{hash}`` + This hierarchical structure prevents too many files in a single directory. + + Note: + The store parameter is required for ```` unless a default store + is configured. Use ```` syntax to specify the store. + """ + + type_name = "content" + dtype = "json" + + def encode(self, value: bytes, *, key: dict | None = None, store_name: str | None = None) -> dict: + """ + Store content and return metadata. + + Computes the SHA256 hash of the content and stores it using content-addressed + storage. If content with the same hash already exists, it is not re-uploaded + (deduplication). + + Args: + value: Raw bytes to store. + key: Primary key values (unused for content storage). + store_name: Store to use. If None, uses default store from config. 
+ + Returns: + Metadata dict with keys: hash, store, size + + Raises: + TypeError: If value is not bytes. + """ + if not isinstance(value, bytes): + raise TypeError(f"<content> type expects bytes, got {type(value).__name__}") + + from .content_registry import put_content + + return put_content(value, store_name=store_name) + + def decode(self, stored: dict, *, key: dict | None = None) -> bytes: + """ + Retrieve content by its hash. + + Args: + stored: Metadata dict with 'hash' and optionally 'store' keys. + key: Primary key values (unused for content retrieval). + + Returns: + The original bytes. + + Raises: + MissingExternalFile: If content is not found. + DataJointError: If hash verification fails. + """ + from .content_registry import get_content + + content_hash = stored["hash"] + store_name = stored.get("store") + return get_content(content_hash, store_name=store_name) + + def validate(self, value: Any) -> None: + """Validate that value is bytes.""" + if not isinstance(value, bytes): + raise TypeError(f"<content> type expects bytes, got {type(value).__name__}") + + +class XBlobType(AttributeType): + """ + Built-in type for externally-stored serialized blobs with deduplication. + + The ``<xblob>`` type combines DataJoint's blob serialization with content-addressed + storage. Objects are serialized using the djblob format, then stored externally + using content-addressed storage for automatic deduplication. + + This type is ideal for large objects (NumPy arrays, pandas DataFrames, etc.) + that may be duplicated across multiple rows. + + Example: + @schema + class LargeArrays(dj.Manual): + definition = ''' + array_id : int + --- + data : <xblob> # External serialized blob with deduplication + ''' + + # Insert NumPy array + import numpy as np + table.insert1({'array_id': 1, 'data': np.random.rand(1000, 1000)}) + + # Fetch returns the original array + data = (table & 'array_id=1').fetch1('data') + + Note: + - For internal storage (in database), use ``<djblob>`` + - For external storage without serialization, use ``<content>`` + - The store parameter is required unless a default store is configured + """ + + type_name = "xblob" + dtype = "<content>" # Composition: uses ContentType for storage + + def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: + """ + Serialize a Python object to bytes. + + The object is serialized using DataJoint's blob format. The resulting + bytes are then passed to the underlying ``<content>`` type for storage. + + Args: + value: Any serializable Python object. + key: Primary key values (unused). + store_name: Store parameter (passed through to content storage). + + Returns: + Serialized bytes (will be stored by ContentType). + """ + from . import blob + + return blob.pack(value, compress=True) + + def decode(self, stored: bytes, *, key: dict | None = None) -> Any: + """ + Deserialize bytes back to a Python object. + + Args: + stored: Serialized bytes retrieved from content storage. + key: Primary key values (unused). + + Returns: + The deserialized Python object. + """ + from . import blob + + return blob.unpack(stored, squeeze=False) + + def _register_builtin_types() -> None: + """ + Register DataJoint's built-in attribute types. + + Called automatically during module initialization.
""" register_type(DJBlobType) + register_type(ContentType) + register_type(XBlobType) # Register built-in types when module is loaded diff --git a/src/datajoint/content_registry.py b/src/datajoint/content_registry.py new file mode 100644 index 000000000..5ff98e917 --- /dev/null +++ b/src/datajoint/content_registry.py @@ -0,0 +1,193 @@ +""" +Content-addressed storage registry for DataJoint. + +This module provides content-addressed storage with deduplication for the +AttributeType. Content is identified by its SHA256 hash and stored in a hierarchical +directory structure: _content/{hash[:2]}/{hash[2:4]}/{hash} + +The ContentRegistry tracks stored content for garbage collection purposes. +""" + +import hashlib +import logging +from typing import Any + +from .errors import DataJointError +from .settings import config +from .storage import StorageBackend + +logger = logging.getLogger(__name__.split(".")[0]) + + +def compute_content_hash(data: bytes) -> str: + """ + Compute SHA256 hash of content. + + Args: + data: Content bytes + + Returns: + Hex-encoded SHA256 hash (64 characters) + """ + return hashlib.sha256(data).hexdigest() + + +def build_content_path(content_hash: str) -> str: + """ + Build the storage path for content-addressed storage. + + Content is stored in a hierarchical structure to avoid too many files + in a single directory: _content/{hash[:2]}/{hash[2:4]}/{hash} + + Args: + content_hash: SHA256 hex hash (64 characters) + + Returns: + Relative path within the store + """ + if len(content_hash) != 64: + raise DataJointError(f"Invalid content hash length: {len(content_hash)} (expected 64)") + return f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + + +def get_store_backend(store_name: str | None = None) -> StorageBackend: + """ + Get a StorageBackend for content storage. + + Args: + store_name: Name of the store to use. If None, uses the default store. + + Returns: + StorageBackend instance + """ + if store_name is None: + # Use default store from object_storage settings + store_name = config.object_storage.default_store + if store_name is None: + raise DataJointError( + "No default store configured. Set object_storage.default_store " + "or specify a store name explicitly." + ) + + spec = config.get_object_store_spec(store_name) + return StorageBackend(spec) + + +def put_content(data: bytes, store_name: str | None = None) -> dict[str, Any]: + """ + Store content using content-addressed storage. + + If the content already exists (same hash), it is not re-uploaded. + Returns metadata including the hash, store, and size. + + Args: + data: Content bytes to store + store_name: Name of the store. If None, uses default store. + + Returns: + Metadata dict with keys: hash, store, size + """ + content_hash = compute_content_hash(data) + path = build_content_path(content_hash) + + backend = get_store_backend(store_name) + + # Check if content already exists (deduplication) + if not backend.exists(path): + backend.put_buffer(data, path) + logger.debug(f"Stored new content: {content_hash[:16]}... ({len(data)} bytes)") + else: + logger.debug(f"Content already exists: {content_hash[:16]}...") + + return { + "hash": content_hash, + "store": store_name, + "size": len(data), + } + + +def get_content(content_hash: str, store_name: str | None = None) -> bytes: + """ + Retrieve content by its hash. + + Args: + content_hash: SHA256 hex hash of the content + store_name: Name of the store. If None, uses default store. 
+ + Returns: + Content bytes + + Raises: + MissingExternalFile: If content is not found + DataJointError: If hash verification fails + """ + path = build_content_path(content_hash) + backend = get_store_backend(store_name) + + data = backend.get_buffer(path) + + # Verify hash (optional but recommended for integrity) + actual_hash = compute_content_hash(data) + if actual_hash != content_hash: + raise DataJointError( + f"Content hash mismatch: expected {content_hash[:16]}..., " + f"got {actual_hash[:16]}..." + ) + + return data + + +def content_exists(content_hash: str, store_name: str | None = None) -> bool: + """ + Check if content exists in storage. + + Args: + content_hash: SHA256 hex hash of the content + store_name: Name of the store. If None, uses default store. + + Returns: + True if content exists + """ + path = build_content_path(content_hash) + backend = get_store_backend(store_name) + return backend.exists(path) + + +def delete_content(content_hash: str, store_name: str | None = None) -> bool: + """ + Delete content from storage. + + WARNING: This should only be called after verifying no references exist. + Use garbage collection to safely remove unreferenced content. + + Args: + content_hash: SHA256 hex hash of the content + store_name: Name of the store. If None, uses default store. + + Returns: + True if content was deleted, False if it didn't exist + """ + path = build_content_path(content_hash) + backend = get_store_backend(store_name) + + if backend.exists(path): + backend.remove(path) + logger.debug(f"Deleted content: {content_hash[:16]}...") + return True + return False + + +def get_content_size(content_hash: str, store_name: str | None = None) -> int: + """ + Get the size of stored content. + + Args: + content_hash: SHA256 hex hash of the content + store_name: Name of the store. If None, uses default store. + + Returns: + Size in bytes + """ + path = build_content_path(content_hash) + backend = get_store_backend(store_name) + return backend.size(path) diff --git a/src/datajoint/fetch.py b/src/datajoint/fetch.py index 000ab0bfd..d021a87d8 100644 --- a/src/datajoint/fetch.py +++ b/src/datajoint/fetch.py @@ -40,7 +40,10 @@ def _get(connection, attr, data, squeeze, download_path): - JSON types are parsed - UUID types are converted from bytes - Blob types return raw bytes (unless an adapter handles them) - - Adapters (AttributeTypes) handle all custom encoding/decoding + - Adapters (AttributeTypes) handle all custom encoding/decoding via type chains + + For composed types (e.g., using ), decoders are applied + in reverse order: innermost first, then outermost. 
:param connection: a dj.Connection object :param attr: attribute from the table's heading @@ -52,30 +55,36 @@ def _get(connection, attr, data, squeeze, download_path): if data is None: return None - # JSON type - parse and optionally decode via adapter + # Get the final storage type and type chain if adapter present + if attr.adapter: + from .attribute_type import resolve_dtype + + final_dtype, type_chain, _ = resolve_dtype(f"<{attr.adapter.type_name}>") + + # First, process the final dtype (what's stored in the database) + if final_dtype.lower() == "json": + data = json.loads(data) + elif final_dtype.lower() in ("longblob", "blob", "mediumblob", "tinyblob"): + pass # Blob data is already bytes + elif final_dtype.lower() == "binary(16)": + data = uuid_module.UUID(bytes=data) + + # Apply decoders in reverse order: innermost first, then outermost + for attr_type in reversed(type_chain): + data = attr_type.decode(data, key=None) + + return data + + # No adapter - handle native types if attr.json: - parsed = json.loads(data) - if attr.adapter: - return attr.adapter.decode(parsed, key=None) - return parsed + return json.loads(data) - # UUID type - convert bytes to UUID object if attr.uuid: - result = uuid_module.UUID(bytes=data) - if attr.adapter: - return attr.adapter.decode(result, key=None) - return result + return uuid_module.UUID(bytes=data) - # Blob type - return raw bytes or decode via adapter if attr.is_blob: - if attr.adapter: - return attr.adapter.decode(data, key=None) return data # raw bytes - # Other types with adapter - if attr.adapter: - return attr.adapter.decode(data, key=None) - # Native types - pass through unchanged return data diff --git a/src/datajoint/table.py b/src/datajoint/table.py index 170e06089..009d475d2 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -925,7 +925,7 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False, row=None): processing by mysql API. 
In the simplified type system: - - Adapters (AttributeTypes) handle all custom encoding + - Adapters (AttributeTypes) handle all custom encoding via type chains - UUID values are converted to bytes - JSON values are serialized - Blob values pass through as bytes @@ -940,10 +940,23 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False, row=None): return None attr = self.heading[name] - # Apply adapter encoding first (if present) + # Apply adapter encoding with type chain support if attr.adapter: + from .attribute_type import resolve_dtype + attr.adapter.validate(value) - value = attr.adapter.encode(value, key=None) + + # Resolve full type chain + _, type_chain, resolved_store = resolve_dtype(f"<{attr.adapter.type_name}>", store_name=attr.store) + + # Apply encoders from outermost to innermost + for attr_type in type_chain: + # Pass store_name to encoders that support it + try: + value = attr_type.encode(value, key=None, store_name=resolved_store) + except TypeError: + # Encoder doesn't accept store_name parameter + value = attr_type.encode(value, key=None) # Handle NULL values if value is None or (attr.numeric and (value == "" or np.isnan(float(value)))): From 328a59a1927d10553c36224ff1aeee2874d9f1b5 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 21:01:37 +0000 Subject: [PATCH 30/41] Apply ruff-format to content_registry.py Co-authored-by: dimitri-yatsenko --- src/datajoint/content_registry.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/datajoint/content_registry.py b/src/datajoint/content_registry.py index 5ff98e917..01e5844cf 100644 --- a/src/datajoint/content_registry.py +++ b/src/datajoint/content_registry.py @@ -65,8 +65,7 @@ def get_store_backend(store_name: str | None = None) -> StorageBackend: store_name = config.object_storage.default_store if store_name is None: raise DataJointError( - "No default store configured. Set object_storage.default_store " - "or specify a store name explicitly." + "No default store configured. Set object_storage.default_store " "or specify a store name explicitly." ) spec = config.get_object_store_spec(store_name) @@ -129,10 +128,7 @@ def get_content(content_hash: str, store_name: str | None = None) -> bytes: # Verify hash (optional but recommended for integrity) actual_hash = compute_content_hash(data) if actual_hash != content_hash: - raise DataJointError( - f"Content hash mismatch: expected {content_hash[:16]}..., " - f"got {actual_hash[:16]}..." - ) + raise DataJointError(f"Content hash mismatch: expected {content_hash[:16]}..., " f"got {actual_hash[:16]}...") return data From bbbfbc38a84a78cb6f5356f4d0991c5c2d5e0ff1 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 21:24:04 +0000 Subject: [PATCH 31/41] Remove legacy compatibility shims: attribute_adapter.py, bypass_serialization Breaking changes: - Remove attribute_adapter.py entirely (hard deprecate) - Remove bypass_serialization flag from blob.py - blobs always serialize now - Remove unused 'database' field from Attribute in heading.py Import get_adapter from attribute_type instead of attribute_adapter. 
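For any code that imported the removed shim directly, the migration is an import swap; a minimal sketch is below (it assumes get_adapter in attribute_type keeps the old shim's (context, name) call shape and tuple return, and that `<djblob>` is the registered built-in serialization type):

```python
# Before (attribute_adapter.py has been deleted):
# from datajoint.attribute_adapter import get_adapter

# After:
from datajoint.attribute_type import get_adapter

adapter, store_name = get_adapter({}, "<djblob>")  # (AttributeType instance, store or None)

# bypass_serialization is gone: blobs are always packed and unpacked.
import datajoint as dj
import numpy as np

packed = dj.blob.pack(np.arange(5))
assert np.array_equal(dj.blob.unpack(packed), np.arange(5))
```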
Co-authored-by: dimitri-yatsenko --- src/datajoint/attribute_adapter.py | 42 ---------------------- src/datajoint/blob.py | 10 ------ src/datajoint/declare.py | 2 +- src/datajoint/heading.py | 4 +-- tests/test_bypass_serialization.py | 57 ------------------------------ 5 files changed, 2 insertions(+), 113 deletions(-) delete mode 100644 src/datajoint/attribute_adapter.py delete mode 100644 tests/test_bypass_serialization.py diff --git a/src/datajoint/attribute_adapter.py b/src/datajoint/attribute_adapter.py deleted file mode 100644 index c92618f9e..000000000 --- a/src/datajoint/attribute_adapter.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -Attribute adapter module - compatibility shim. - -This module re-exports functions from attribute_type for backward compatibility -with code that imports from attribute_adapter. - -.. deprecated:: 0.15 - Import directly from :mod:`datajoint.attribute_type` instead. -""" - -from .attribute_type import ( - AttributeType, - get_type, - is_type_registered, - parse_type_spec, -) -from .errors import DataJointError - - -def get_adapter(context: dict | None, adapter_name: str) -> tuple[AttributeType, str | None]: - """ - Get an attribute type by name. - - Args: - context: Ignored (legacy parameter, kept for API compatibility). - adapter_name: The type name, with or without angle brackets. - May include store parameter (e.g., ""). - - Returns: - Tuple of (AttributeType instance, store_name or None). - - Raises: - DataJointError: If the type is not found. - """ - # Parse type name and optional store parameter - type_name, store_name = parse_type_spec(adapter_name) - - # Look up in the global type registry - if is_type_registered(type_name): - return get_type(type_name), store_name - - raise DataJointError(f"Attribute type <{type_name}> is not registered. 
" "Use @dj.register_type to register custom types.") diff --git a/src/datajoint/blob.py b/src/datajoint/blob.py index 424d88779..15364bfa4 100644 --- a/src/datajoint/blob.py +++ b/src/datajoint/blob.py @@ -56,8 +56,6 @@ compression = {b"ZL123\0": zlib.decompress} -bypass_serialization = False # runtime setting to bypass blob (en|de)code - # runtime setting to read integers as 32-bit to read blobs created by the 32-bit # version of the mYm library for MATLAB use_32bit_dims = False @@ -507,17 +505,9 @@ def pack(self, obj, compress): def pack(obj, compress=True): - if bypass_serialization: - # provide a way to move blobs quickly without de/serialization - assert isinstance(obj, bytes) and obj.startswith((b"ZL123\0", b"mYm\0", b"dj0\0")) - return obj return Blob().pack(obj, compress=compress) def unpack(blob, squeeze=False): - if bypass_serialization: - # provide a way to move blobs quickly without de/serialization - assert isinstance(blob, bytes) and blob.startswith((b"ZL123\0", b"mYm\0", b"dj0\0")) - return blob if blob is not None: return Blob(squeeze=squeeze).unpack(blob) diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index c08a5fd4c..68286de2c 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -9,7 +9,7 @@ import pyparsing as pp -from .attribute_adapter import get_adapter +from .attribute_type import get_adapter from .condition import translate_attribute from .errors import DataJointError from .settings import config diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index 9750b84f3..3221522fd 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -5,7 +5,7 @@ import numpy as np -from .attribute_adapter import get_adapter +from .attribute_type import get_adapter from .attribute_type import AttributeType from .declare import ( CORE_TYPE_NAMES, @@ -65,7 +65,6 @@ def decode(self, stored, *, key=None): store=None, unsupported=False, attribute_expression=None, - database=None, dtype=object, ) @@ -282,7 +281,6 @@ def _init_from_database(self): for attr in attributes: attr.update( in_key=(attr["in_key"] == "PRI"), - database=database, nullable=attr["nullable"] == "YES", autoincrement=bool(re.search(r"auto_increment", attr["Extra"], flags=re.I)), numeric=any(TYPE_PATTERN[t].match(attr["type"]) for t in ("DECIMAL", "INTEGER", "FLOAT")), diff --git a/tests/test_bypass_serialization.py b/tests/test_bypass_serialization.py deleted file mode 100644 index da7f0b0e3..000000000 --- a/tests/test_bypass_serialization.py +++ /dev/null @@ -1,57 +0,0 @@ -import numpy as np -import pytest -from numpy.testing import assert_array_equal - -import datajoint as dj - -test_blob = np.array([1, 2, 3]) - - -class Input(dj.Lookup): - definition = """ - id: int - --- - data: blob - """ - contents = [(0, test_blob)] - - -class Output(dj.Manual): - definition = """ - id: int - --- - data: blob - """ - - -@pytest.fixture -def schema_in(connection_test, prefix): - schema = dj.Schema( - prefix + "_test_bypass_serialization_in", - context=dict(Input=Input), - connection=connection_test, - ) - schema(Input) - yield schema - schema.drop() - - -@pytest.fixture -def schema_out(connection_test, prefix): - schema = dj.Schema( - prefix + "_test_blob_bypass_serialization_out", - context=dict(Output=Output), - connection=connection_test, - ) - schema(Output) - yield schema - schema.drop() - - -def test_bypass_serialization(schema_in, schema_out): - dj.blob.bypass_serialization = True - contents = Input.fetch(as_dict=True) - assert isinstance(contents[0]["data"], 
bytes) - Output.insert(contents) - dj.blob.bypass_serialization = False - assert_array_equal(Input.fetch1("data"), Output.fetch1("data")) From 3c4608fc94cc9fd33f4fcc5c8601b5a4a725c301 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 22:12:19 +0000 Subject: [PATCH 32/41] Update implementation plan to reflect actual implementation - Document function-based content storage (not registry class) - Add implementation status table - Explain design decision: functions vs database table - Update Phase 5 GC design for scanning approach - Document removed/deprecated items Co-authored-by: dimitri-yatsenko --- .../storage-types-implementation-plan.md | 595 ++++++------------ 1 file changed, 194 insertions(+), 401 deletions(-) diff --git a/docs/src/design/tables/storage-types-implementation-plan.md b/docs/src/design/tables/storage-types-implementation-plan.md index 13d2e45d3..a425837eb 100644 --- a/docs/src/design/tables/storage-types-implementation-plan.md +++ b/docs/src/design/tables/storage-types-implementation-plan.md @@ -7,187 +7,150 @@ This plan describes the implementation of a three-layer type architecture for Da 1. Establish a clean three-layer type hierarchy (native DB types, core DataJoint types, AttributeTypes) 2. Implement content-addressed storage with deduplication 3. Provide composable, user-friendly types (``, ``, ``) -4. Enable project-wide garbage collection via `ContentRegistry` +4. Enable project-wide garbage collection 5. Maintain backward compatibility with existing schemas --- -## Phase 1: Core Type System Foundation +## Implementation Status -**Goal**: Establish the complete Layer 2 core type mappings and enhance the AttributeType infrastructure. +| Phase | Status | Notes | +|-------|--------|-------| +| Phase 1: Core Type System | ✅ Complete | CORE_TYPES dict, type chain resolution | +| Phase 2: Content-Addressed Storage | ✅ Complete | Function-based, no registry table | +| Phase 3: User-Defined AttributeTypes | 🔲 Pending | XBlobType done, AttachType/FilepathType pending | +| Phase 4: Insert and Fetch Integration | ✅ Complete | Type chain encoding/decoding | +| Phase 5: Garbage Collection | 🔲 Pending | | +| Phase 6: Migration Utilities | 🔲 Pending | | +| Phase 7: Documentation and Testing | 🔲 Pending | | -### 1.1 Expand Core Type Mappings - -**Files to modify:** -- `src/datajoint/declare.py` - -**Current state**: `SQL_TYPE_ALIASES` already maps some types (float32, int32, etc.) - -**Changes needed**: -1. Complete the type mappings as per spec: - ``` - Core Type -> MySQL Type - int8 -> TINYINT - uint8 -> TINYINT UNSIGNED - int16 -> SMALLINT - ... - json -> JSON - uuid -> BINARY(16) or CHAR(36) - decimal -> DECIMAL(p,s) - ``` - -2. Add PostgreSQL mappings for future support (can be placeholder initially) - -**Dependencies**: None - -### 1.2 Enhance AttributeType with Store Parameter Support +--- -**Files to modify:** -- `src/datajoint/attribute_type.py` +## Phase 1: Core Type System Foundation ✅ -**Current state**: Types don't support `@store` parameter syntax +**Status**: Complete -**Changes needed**: -1. Add `store_name` property to `AttributeType` -2. Modify `resolve_dtype()` to handle `` syntax -3. 
Add `get_type_with_store(name_with_store)` helper that parses `xblob@cold` format +### Implemented in `src/datajoint/declare.py`: ```python -def parse_type_spec(spec: str) -> tuple[str, str | None]: - """Parse '' or '' into (type_name, store_name).""" - spec = spec.strip("<>") - if "@" in spec: - type_name, store_name = spec.split("@", 1) - return type_name, store_name - return spec, None +CORE_TYPES = { + # Numeric types (aliased to native SQL) + "float32": (r"float32$", "float"), + "float64": (r"float64$", "double"), + "int64": (r"int64$", "bigint"), + "uint64": (r"uint64$", "bigint unsigned"), + "int32": (r"int32$", "int"), + "uint32": (r"uint32$", "int unsigned"), + "int16": (r"int16$", "smallint"), + "uint16": (r"uint16$", "smallint unsigned"), + "int8": (r"int8$", "tinyint"), + "uint8": (r"uint8$", "tinyint unsigned"), + "bool": (r"bool$", "tinyint"), + # UUID (stored as binary) + "uuid": (r"uuid$", "binary(16)"), + # JSON + "json": (r"json$", None), + # Binary (blob maps to longblob) + "blob": (r"blob$", "longblob"), + # Temporal + "date": (r"date$", None), + "datetime": (r"datetime$", None), + # String types (with parameters) + "char": (r"char\s*\(\d+\)$", None), + "varchar": (r"varchar\s*\(\d+\)$", None), + # Enumeration + "enum": (r"enum\s*\(.+\)$", None), +} ``` -**Dependencies**: None +### Key changes: +- Removed `SERIALIZED_TYPES`, `BINARY_TYPES`, `EXTERNAL_TYPES` +- Core types are recorded in field comments with `:type:` syntax +- Non-standard native types pass through with warning +- `parse_type_spec()` handles `` syntax +- `resolve_dtype()` returns `(final_dtype, type_chain, store_name)` tuple -### 1.3 Update Heading and Declaration Parsing +--- -**Files to modify:** -- `src/datajoint/heading.py` -- `src/datajoint/declare.py` +## Phase 2: Content-Addressed Storage ✅ -**Changes needed**: -1. Update `TYPE_PATTERN` to recognize new AttributeType patterns -2. Store `store_name` in attribute metadata for parameterized types -3. Update `compile_attribute()` to handle `` syntax -4. Update `_init_from_database()` to reconstruct store information +**Status**: Complete (simplified design) -**Dependencies**: Phase 1.2 +### Design Decision: Functions vs Class ---- +The original plan proposed a `ContentRegistry` class with a database table. We implemented a simpler, stateless approach using functions in `content_registry.py`: -## Phase 2: Content-Addressed Storage Implementation +**Why functions instead of a registry table:** +1. **Simpler** - No additional database table to manage +2. **Decoupled** - Content storage is independent of any schema +3. **GC by scanning** - Garbage collection scans tables for references rather than maintaining reference counts +4. **Less state** - No synchronization issues between registry and actual storage -**Goal**: Implement the `` type with content-addressed storage and deduplication. +### Implemented in `src/datajoint/content_registry.py`: -### 2.1 Create ContentRegistry Table +```python +def compute_content_hash(data: bytes) -> str: + """Compute SHA256 hash of content.""" + return hashlib.sha256(data).hexdigest() -**New file to create:** -- `src/datajoint/content_registry.py` +def build_content_path(content_hash: str) -> str: + """Build path: _content/{hash[:2]}/{hash[2:4]}/{hash}""" + return f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" -**Implementation**: -```python -class ContentRegistry: - """ - Project-level content registry for content-addressed storage. - Stored in a designated database (e.g., `{project}_content`). 
- """ - definition = """ - # Content-addressed object registry (project-wide) - content_hash : char(64) # SHA256 hex - --- - store : varchar(64) # Store name - size : bigint unsigned # Size in bytes - created : timestamp DEFAULT CURRENT_TIMESTAMP - """ -``` +def put_content(data: bytes, store_name: str | None = None) -> dict[str, Any]: + """Store content with deduplication. Returns {hash, store, size}.""" + ... -Key features: -- Auto-create the registry database on first use -- Methods: `insert_content()`, `get_content()`, `increment_ref()`, `decrement_ref()` -- Thread-safe reference counting (if needed) +def get_content(content_hash: str, store_name: str | None = None) -> bytes: + """Retrieve content by hash with verification.""" + ... -**Dependencies**: None +def content_exists(content_hash: str, store_name: str | None = None) -> bool: + """Check if content exists.""" + ... -### 2.2 Implement ContentType AttributeType +def delete_content(content_hash: str, store_name: str | None = None) -> bool: + """Delete content (use with caution - verify no references first).""" + ... +``` -**Files to modify:** -- `src/datajoint/attribute_type.py` +### Implemented AttributeTypes in `src/datajoint/attribute_type.py`: -**New built-in type**: ```python class ContentType(AttributeType): - """Built-in AttributeType for content-addressed storage.""" + """Content-addressed storage. Stores bytes, returns JSON metadata.""" type_name = "content" dtype = "json" - def encode(self, data: bytes, *, key=None, store_name=None) -> dict: - """Store content, return metadata as JSON.""" - content_hash = hashlib.sha256(data).hexdigest() - path = f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" - # Store if not exists, register in ContentRegistry - ... - return {"hash": content_hash, "store": store_name, "size": len(data)} + def encode(self, value: bytes, *, key=None, store_name=None) -> dict: + return put_content(value, store_name=store_name) def decode(self, stored: dict, *, key=None) -> bytes: - """Retrieve content by hash.""" - ... -``` - -**Dependencies**: Phase 2.1 - -### 2.3 Implement Content Storage Backend Methods - -**Files to modify:** -- `src/datajoint/storage.py` - -**Changes needed**: -1. Add `put_content()` method with deduplication -2. Add `get_content()` method with hash verification -3. Add `compute_content_hash()` utility -4. Add content path generation: `_content/{hash[:2]}/{hash[2:4]}/{hash}` + return get_content(stored["hash"], store_name=stored.get("store")) -**Dependencies**: None ---- - -## Phase 3: User-Defined AttributeTypes - -**Goal**: Implement the standard user-facing types that compose with `` and ``. - -### 3.1 Implement XBlobType (External Blob) - -**Files to modify:** -- `src/datajoint/attribute_type.py` - -```python -@register_type class XBlobType(AttributeType): """External serialized blob using content-addressed storage.""" type_name = "xblob" - dtype = "" # Composition: uses ContentType + dtype = "" # Composition - def encode(self, value, *, key=None) -> bytes: - from . import blob + def encode(self, value, *, key=None, store_name=None) -> bytes: return blob.pack(value, compress=True) - def decode(self, stored, *, key=None) -> Any: - from . 
import blob - return blob.unpack(stored) + def decode(self, stored: bytes, *, key=None) -> Any: + return blob.unpack(stored, squeeze=False) ``` -**Key behavior**: Serializes to djblob format, stores via content-addressed storage +--- -**Dependencies**: Phase 2.2 +## Phase 3: User-Defined AttributeTypes -### 3.2 Implement AttachType and XAttachType +**Status**: Partially complete -**Files to modify:** -- `src/datajoint/attribute_type.py` +### 3.1 XBlobType ✅ +Implemented as shown above. Composes with ``. + +### 3.2 AttachType and XAttachType 🔲 ```python @register_type @@ -210,22 +173,10 @@ class XAttachType(AttributeType): """External file attachment using content-addressed storage.""" type_name = "xattach" dtype = "" - - def encode(self, filepath, *, key=None) -> bytes: - path = Path(filepath) - return path.name.encode() + b"\0" + path.read_bytes() - - def decode(self, stored, *, key=None) -> str: - # Same as AttachType.decode() - ... + # Similar to AttachType but composes with content storage ``` -**Dependencies**: Phase 2.2 - -### 3.3 Implement FilepathType - -**Files to modify:** -- `src/datajoint/attribute_type.py` +### 3.3 FilepathType 🔲 ```python @register_type @@ -234,337 +185,179 @@ class FilepathType(AttributeType): type_name = "filepath" dtype = "json" - def encode(self, relative_path: str, *, key=None, store_name=None, - compute_checksum: bool = False) -> dict: + def encode(self, relative_path: str, *, key=None, store_name=None) -> dict: """Register reference to file in store.""" - store = get_store(store_name) # Required for filepath - metadata = {'path': relative_path, 'store': store_name} - if compute_checksum: - # Compute checksum and size - ... - return metadata + return {'path': relative_path, 'store': store_name} def decode(self, stored: dict, *, key=None) -> ObjectRef: """Return ObjectRef for lazy access.""" - return ObjectRef( - store=get_store(stored['store']), - path=stored['path'], - checksum=stored.get('checksum') - ) + return ObjectRef(store=stored['store'], path=stored['path']) ``` -**Key difference from legacy**: Returns `ObjectRef` instead of copying to local stage - -**Dependencies**: Existing `ObjectRef` and `StorageBackend` - --- -## Phase 4: Insert and Fetch Integration +## Phase 4: Insert and Fetch Integration ✅ -**Goal**: Update the data path to handle the new type system seamlessly. +**Status**: Complete -### 4.1 Update Insert Processing - -**Files to modify:** -- `src/datajoint/table.py` - -**Changes needed in `__make_placeholder()`**: -1. Handle type composition (resolve full type chain) -2. Pass `store_name` to `encode()` when applicable -3. Handle `` type's special behavior -4. Process `` with store parameter +### Updated in `src/datajoint/table.py`: ```python def __make_placeholder(self, name, value, ...): - attr = self.heading[name] if attr.adapter: - # Resolve type chain and pass store_name - final_dtype, type_chain = resolve_dtype(attr.adapter.dtype) - store_name = attr.store - - # Apply type chain: outer -> inner + from .attribute_type import resolve_dtype + attr.adapter.validate(value) + _, type_chain, resolved_store = resolve_dtype( + f"<{attr.adapter.type_name}>", store_name=attr.store + ) + # Apply type chain: outermost → innermost for attr_type in type_chain: - value = attr_type.encode(value, key=key, store_name=store_name) - - # Continue with final_dtype processing - ... 
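+            # Illustrative walk-through (column and store names are examples only):
+            # for a column declared `data : <xblob@cold>`, the chain resolves to
+            # [XBlobType, ContentType]; XBlobType.encode packs the object to bytes,
+            # then ContentType.encode stores those bytes via put_content() and returns
+            # the metadata dict that ends up in the JSON column.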
+ try: + value = attr_type.encode(value, key=None, store_name=resolved_store) + except TypeError: + value = attr_type.encode(value, key=None) ``` -**Dependencies**: Phases 1-3 - -### 4.2 Update Fetch Processing - -**Files to modify:** -- `src/datajoint/fetch.py` - -**Changes needed in `_get()`**: -1. Handle `` type: retrieve from content store -2. Handle type composition: apply decoders in reverse order -3. Handle ``: return `ObjectRef` instead of downloading +### Updated in `src/datajoint/fetch.py`: ```python def _get(connection, attr, data, squeeze, download_path): if attr.adapter: - final_dtype, type_chain = resolve_dtype(attr.adapter.dtype) + from .attribute_type import resolve_dtype + final_dtype, type_chain, _ = resolve_dtype(f"<{attr.adapter.type_name}>") - # Process based on final_dtype - if final_dtype == "json": + # Parse JSON if final storage is JSON + if final_dtype.lower() == "json": data = json.loads(data) - elif final_dtype == "longblob": - # Handle content retrieval if needed - ... - # Apply type chain in reverse: inner -> outer + # Apply type chain in reverse: innermost → outermost for attr_type in reversed(type_chain): - data = attr_type.decode(data, key=key) + data = attr_type.decode(data, key=None) return data ``` -**Dependencies**: Phases 1-3 - -### 4.3 Update Heading Attribute Properties - -**Files to modify:** -- `src/datajoint/heading.py` - -**Changes needed**: -1. Add `is_content` property for content-addressed attributes -2. Update property detection logic for new types -3. Store composed type information for fetch/insert - -**Dependencies**: Phase 1.3 - --- -## Phase 5: Garbage Collection +## Phase 5: Garbage Collection 🔲 -**Goal**: Implement project-wide garbage collection for content-addressed storage. +**Status**: Pending -### 5.1 Implement GC Scanner +### Design (updated for function-based approach): -**New file to create:** -- `src/datajoint/gc.py` +Since we don't have a registry table, GC works by scanning: ```python -def scan_content_references(project) -> set[tuple[str, str]]: +def scan_content_references(schemas: list) -> set[tuple[str, str]]: """ - Scan all schemas in project for content references. + Scan all schemas for content references. Returns: Set of (content_hash, store) tuples that are referenced """ referenced = set() - for schema in project.schemas: + for schema in schemas: for table in schema.tables: for attr in table.heading.attributes: - if attr.type in ('content', 'xblob', 'xattach'): - hashes = table.fetch(attr.name) - for h in hashes: - if isinstance(h, dict): - referenced.add((h['hash'], h.get('store'))) + if uses_content_storage(attr): + # Fetch all JSON metadata from this column + for row in table.fetch(attr.name): + if isinstance(row, dict) and 'hash' in row: + referenced.add((row['hash'], row.get('store'))) return referenced -def garbage_collect(project, dry_run=True) -> dict: +def list_stored_content(store_name: str) -> set[str]: + """List all content hashes in a store by scanning _content/ directory.""" + ... + +def garbage_collect(schemas: list, store_name: str, dry_run=True) -> dict: """ Remove unreferenced content from storage. Returns: Stats: {'scanned': N, 'orphaned': M, 'deleted': K, 'bytes_freed': B} """ - ... 
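+    # Usage sketch (signature as proposed above; schema and store names illustrative):
+    #   stats = garbage_collect([schema_a, schema_b], store_name="main", dry_run=True)
+    #   stats["orphaned"]   # count of unreferenced objects a real run would delete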
-``` - -**Dependencies**: Phase 2.1 - -### 5.2 Add GC CLI Commands - -**Files to modify:** -- CLI or management interface + referenced = scan_content_references(schemas) + stored = list_stored_content(store_name) + orphaned = stored - {h for h, s in referenced if s == store_name} -**New commands**: -- `dj gc scan` - Scan and report orphaned content -- `dj gc clean` - Remove orphaned content -- `dj gc status` - Show content registry status - -**Dependencies**: Phase 5.1 - ---- + if not dry_run: + for content_hash in orphaned: + delete_content(content_hash, store_name) -## Phase 6: Migration Utilities - -**Goal**: Provide tools to migrate existing schemas to the new type system. - -### 6.1 Enhance Migration Module - -**Files to modify:** -- `src/datajoint/migrate.py` - -**New functions**: - -```python -def analyze_external_stores(schema) -> list[dict]: - """Analyze legacy ~external_* tables for migration.""" - ... - -def migrate_external_to_content(schema, store_name, dry_run=True) -> dict: - """ - Migrate legacy ~external_{store} to new ContentRegistry. - - Steps: - 1. Read entries from ~external_{store} - 2. For each entry: fetch content, compute SHA256 - 3. Copy to _content/{hash}/ if not exists - 4. Update referencing tables (UUID -> hash JSON) - 5. Register in ContentRegistry - """ - ... - -def migrate_blob_to_djblob(schema, dry_run=True) -> dict: - """Update implicit blob columns to use .""" - ... - -def migrate_filepath_to_new(schema, dry_run=True) -> dict: - """ - Migrate legacy filepath@store to new . - - Changes: - - UUID column -> JSON column - - Copy-based access -> ObjectRef-based access - """ - ... + return {'orphaned': len(orphaned), ...} ``` -### 6.2 Create Migration CLI - -**New commands**: -- `dj migrate analyze ` - Analyze migration needs -- `dj migrate external ` - Migrate external store -- `dj migrate blobs ` - Migrate blob columns -- `dj migrate status ` - Show migration status - -**Dependencies**: Phase 6.1 - --- -## Phase 7: Documentation and Testing +## Phase 6: Migration Utilities 🔲 -### 7.1 Unit Tests +**Status**: Pending -**New test files:** -- `tests/test_content_type.py` - Content-addressed storage tests -- `tests/test_xblob.py` - XBlob type tests -- `tests/test_attach_types.py` - Attachment type tests -- `tests/test_filepath_new.py` - New filepath tests -- `tests/test_gc.py` - Garbage collection tests -- `tests/test_migration.py` - Migration utility tests +### Key migrations needed: +1. Legacy `~external_{store}` tables → content-addressed storage +2. UUID-based external references → hash-based JSON metadata +3. Legacy `filepath@store` → new `` with ObjectRef -**Existing test files to update:** -- `tests/test_attribute_type.py` - Add new type tests -- `tests/test_object.py` - Verify object type unchanged - -### 7.2 Integration Tests +--- -**Test scenarios**: -1. Insert/fetch roundtrip for all new types -2. Type composition (xblob using content) -3. Multi-schema content deduplication -4. GC with cross-schema references -5. Migration from legacy external stores -6. 
Backward compatibility with existing schemas +## Phase 7: Documentation and Testing 🔲 -### 7.3 Documentation +**Status**: Pending -**Files to update:** -- `docs/src/design/tables/storage-types-spec.md` - Already exists -- Create user guide for new types -- Create migration guide -- Update API reference +### Test files to create: +- `tests/test_content_storage.py` - Content-addressed storage functions +- `tests/test_xblob.py` - XBlobType roundtrip +- `tests/test_type_composition.py` - Type chain encoding/decoding +- `tests/test_gc.py` - Garbage collection --- -## Implementation Order and Dependencies +## Critical Files Summary -``` -Phase 1: Core Type System Foundation -├── 1.1 Expand Core Type Mappings (no deps) -├── 1.2 Enhance AttributeType with Store Parameter (no deps) -└── 1.3 Update Heading and Declaration Parsing (depends on 1.2) - -Phase 2: Content-Addressed Storage -├── 2.1 Create ContentRegistry Table (no deps) -├── 2.2 Implement ContentType (depends on 2.1) -└── 2.3 Content Storage Backend Methods (no deps) - -Phase 3: User-Defined AttributeTypes (depends on Phase 2) -├── 3.1 Implement XBlobType (depends on 2.2) -├── 3.2 Implement AttachType and XAttachType (depends on 2.2) -└── 3.3 Implement FilepathType (no deps) - -Phase 4: Insert and Fetch Integration (depends on Phases 1-3) -├── 4.1 Update Insert Processing -├── 4.2 Update Fetch Processing -└── 4.3 Update Heading Attribute Properties - -Phase 5: Garbage Collection (depends on Phase 2) -├── 5.1 Implement GC Scanner -└── 5.2 Add GC CLI Commands - -Phase 6: Migration Utilities (depends on Phases 2-4) -├── 6.1 Enhance Migration Module -└── 6.2 Create Migration CLI - -Phase 7: Documentation and Testing (ongoing) -``` +| File | Status | Changes | +|------|--------|---------| +| `src/datajoint/declare.py` | ✅ | CORE_TYPES, type parsing, SQL generation | +| `src/datajoint/heading.py` | ✅ | Simplified attribute properties | +| `src/datajoint/attribute_type.py` | ✅ | ContentType, XBlobType, type chain resolution | +| `src/datajoint/content_registry.py` | ✅ | Content storage functions (put, get, delete) | +| `src/datajoint/table.py` | ✅ | Type chain encoding on insert | +| `src/datajoint/fetch.py` | ✅ | Type chain decoding on fetch | +| `src/datajoint/blob.py` | ✅ | Removed bypass_serialization | +| `src/datajoint/gc.py` | 🔲 | Garbage collection (to be created) | +| `src/datajoint/migrate.py` | 🔲 | Migration utilities | --- -## Critical Files Summary +## Removed/Deprecated -| File | Changes | -|------|---------| -| `src/datajoint/attribute_type.py` | All new AttributeTypes: `ContentType`, `XBlobType`, `AttachType`, `XAttachType`, `FilepathType` | -| `src/datajoint/declare.py` | Type pattern parsing, SQL generation, `` syntax | -| `src/datajoint/heading.py` | Attribute metadata, composed type information | -| `src/datajoint/table.py` | Insert logic with type composition | -| `src/datajoint/fetch.py` | Fetch logic with type chain decoding | -| `src/datajoint/content_registry.py` | **New**: ContentRegistry table and methods | -| `src/datajoint/gc.py` | **New**: Garbage collection scanner | -| `src/datajoint/migrate.py` | Migration utilities | +- `src/datajoint/attribute_adapter.py` - Deleted (hard deprecated) +- `bypass_serialization` flag in `blob.py` - Removed +- `database` field in Attribute - Removed (unused) +- `SERIALIZED_TYPES`, `BINARY_TYPES`, `EXTERNAL_TYPES` - Removed +- `is_attachment`, `is_filepath`, `is_object`, `is_external` flags - Removed --- -## Risk Mitigation +## Architecture Summary -### Backward Compatibility -1. 
All existing types (`longblob`, `blob@store`, `attach@store`, `filepath@store`) continue to work -2. Legacy `~external_*` tables remain functional during transition -3. Implicit blob serialization preserved for existing schemas -4. Migration is opt-in and reversible +``` +Layer 3: AttributeTypes (user-facing) + , , , , + ↓ encode() / ↑ decode() -### Performance Considerations -1. Content hashing uses SHA256 (fast, widely supported) -2. Deduplication reduces storage costs -3. Lazy ObjectRef prevents unnecessary I/O -4. GC runs on-demand, not automatically +Layer 2: Core DataJoint Types + float32, int64, uuid, json, blob, varchar(n), etc. + ↓ SQL mapping -### Error Handling -1. Content hash verification on fetch (optional) -2. Graceful handling of missing content -3. Transaction safety for multi-table operations -4. Clear error messages for misconfiguration +Layer 1: Native Database Types + FLOAT, BIGINT, BINARY(16), JSON, LONGBLOB, VARCHAR(n), etc. +``` ---- +**Type Composition Example:** +``` + → json (in DB) -## Estimated Effort - -| Phase | Estimated Days | Risk | -|-------|----------------|------| -| Phase 1 | 3-4 days | Low | -| Phase 2 | 4-5 days | Medium | -| Phase 3 | 3-4 days | Low | -| Phase 4 | 4-5 days | Medium | -| Phase 5 | 2-3 days | Low | -| Phase 6 | 3-4 days | Medium | -| Phase 7 | 5-7 days | Low | -| **Total** | **24-32 days** | | +Insert: Python object → blob.pack() → put_content() → JSON metadata +Fetch: JSON metadata → get_content() → blob.unpack() → Python object +``` From 70fb567afdb59fc013e67c2f0cc09323ffec1da2 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 22:19:51 +0000 Subject: [PATCH 33/41] Move built-in AttributeTypes to separate builtin_types.py module - Create builtin_types.py with DJBlobType, ContentType, XBlobType - Types serve as examples for users creating custom types - Module docstring includes example of defining a custom GraphType - Add get_adapter() function to attribute_type.py for compatibility - Auto-register built-in types via import at module load Co-authored-by: dimitri-yatsenko --- src/datajoint/attribute_type.py | 297 +++----------------------------- src/datajoint/builtin_types.py | 239 +++++++++++++++++++++++++ 2 files changed, 260 insertions(+), 276 deletions(-) create mode 100644 src/datajoint/builtin_types.py diff --git a/src/datajoint/attribute_type.py b/src/datajoint/attribute_type.py index 2c06ccc83..37fae88ca 100644 --- a/src/datajoint/attribute_type.py +++ b/src/datajoint/attribute_type.py @@ -463,290 +463,35 @@ def resolve_dtype( return dtype, chain, store_name -# ============================================================================= -# Built-in Attribute Types -# ============================================================================= - - -class DJBlobType(AttributeType): - """ - Built-in type for DataJoint's native serialization format. - - This type handles serialization of arbitrary Python objects (including NumPy arrays, - dictionaries, lists, etc.) using DataJoint's binary blob format. The format includes: - - - Protocol headers (``mYm`` for MATLAB-compatible, ``dj0`` for Python-native) - - Optional compression (zlib) - - Support for NumPy arrays, datetime objects, UUIDs, and nested structures - - The ```` type is the explicit way to specify DataJoint's serialization. - It stores data in a MySQL ``LONGBLOB`` column. 
- - Example: - @schema - class ProcessedData(dj.Manual): - definition = ''' - data_id : int - --- - results : # Serialized Python objects - raw_bytes : longblob # Raw bytes (no serialization) - ''' - - Note: - Plain ``longblob`` columns store and return raw bytes without serialization. - Use ```` when you need automatic serialization of Python objects. - Existing schemas using implicit blob serialization should migrate to ```` - using ``dj.migrate.migrate_blob_columns()``. - """ - - type_name = "djblob" - dtype = "longblob" - - def encode(self, value: Any, *, key: dict | None = None) -> bytes: - """ - Serialize a Python object to DataJoint's blob format. - - Args: - value: Any serializable Python object (dict, list, numpy array, etc.) - key: Primary key values (unused for blob serialization). - - Returns: - Serialized bytes with protocol header and optional compression. - """ - from . import blob - - return blob.pack(value, compress=True) - - def decode(self, stored: bytes, *, key: dict | None = None) -> Any: - """ - Deserialize DataJoint blob format back to a Python object. - - Args: - stored: Serialized blob bytes. - key: Primary key values (unused for blob serialization). - - Returns: - The deserialized Python object. - """ - from . import blob - - return blob.unpack(stored, squeeze=False) - - -class DJBlobExternalType(AttributeType): - """ - Built-in type for externally-stored DataJoint blobs. - - Similar to ```` but stores data in external blob storage instead - of inline in the database. Useful for large objects. - - The store name is specified when defining the column type. - - Example: - @schema - class LargeData(dj.Manual): - definition = ''' - data_id : int - --- - large_array : blob@mystore # External storage with auto-serialization - ''' +def get_adapter(context: dict | None, adapter_name: str) -> tuple[AttributeType, str | None]: """ + Get an attribute type by name. - # Note: This type isn't directly usable via syntax - # It's used internally when blob@store syntax is detected - type_name = "djblob_external" - dtype = "blob@store" # Placeholder - actual store is determined at declaration time - - def encode(self, value: Any, *, key: dict | None = None) -> bytes: - """Serialize a Python object to DataJoint's blob format.""" - from . import blob - - return blob.pack(value, compress=True) - - def decode(self, stored: bytes, *, key: dict | None = None) -> Any: - """Deserialize DataJoint blob format back to a Python object.""" - from . import blob - - return blob.unpack(stored, squeeze=False) - - -class ContentType(AttributeType): - """ - Built-in type for content-addressed storage with deduplication. - - The ```` type stores data using content-addressed storage. Data is - identified by its SHA256 hash and stored in a hierarchical directory structure. - Duplicate content is automatically deduplicated - storing the same bytes twice - will only create one copy in storage. - - The database column stores JSON metadata including the content hash, store name, - and size. The actual content is stored in external storage. - - This type is primarily used as a building block for other types like ```` - and ````, but can also be used directly for raw binary content. 
- - Example: - @schema - class RawContent(dj.Manual): - definition = ''' - content_id : int - --- - data : # Content-addressed storage - ''' - - # Insert raw bytes - table.insert1({'content_id': 1, 'data': b'raw binary content'}) - - # Fetch returns the original bytes - data = (table & 'content_id=1').fetch1('data') - assert data == b'raw binary content' - - Storage Structure: - Content is stored at: ``_content/{hash[:2]}/{hash[2:4]}/{hash}`` - This hierarchical structure prevents too many files in a single directory. - - Note: - The store parameter is required for ```` unless a default store - is configured. Use ```` syntax to specify the store. - """ + This is a compatibility function used by heading and declare modules. - type_name = "content" - dtype = "json" - - def encode(self, value: bytes, *, key: dict | None = None, store_name: str | None = None) -> dict: - """ - Store content and return metadata. - - Computes the SHA256 hash of the content and stores it using content-addressed - storage. If content with the same hash already exists, it is not re-uploaded - (deduplication). - - Args: - value: Raw bytes to store. - key: Primary key values (unused for content storage). - store_name: Store to use. If None, uses default store from config. - - Returns: - Metadata dict with keys: hash, store, size - - Raises: - TypeError: If value is not bytes. - """ - if not isinstance(value, bytes): - raise TypeError(f" type expects bytes, got {type(value).__name__}") - - from .content_registry import put_content - - return put_content(value, store_name=store_name) - - def decode(self, stored: dict, *, key: dict | None = None) -> bytes: - """ - Retrieve content by its hash. - - Args: - stored: Metadata dict with 'hash' and optionally 'store' keys. - key: Primary key values (unused for content retrieval). - - Returns: - The original bytes. - - Raises: - MissingExternalFile: If content is not found. - DataJointError: If hash verification fails. - """ - from .content_registry import get_content - - content_hash = stored["hash"] - store_name = stored.get("store") - return get_content(content_hash, store_name=store_name) - - def validate(self, value: Any) -> None: - """Validate that value is bytes.""" - if not isinstance(value, bytes): - raise TypeError(f" type expects bytes, got {type(value).__name__}") - - -class XBlobType(AttributeType): - """ - Built-in type for externally-stored serialized blobs with deduplication. - - The ```` type combines DataJoint's blob serialization with content-addressed - storage. Objects are serialized using the djblob format, then stored externally - using content-addressed storage for automatic deduplication. - - This type is ideal for large objects (NumPy arrays, pandas DataFrames, etc.) - that may be duplicated across multiple rows. - - Example: - @schema - class LargeArrays(dj.Manual): - definition = ''' - array_id : int - --- - data : # External serialized blob with deduplication - ''' - - # Insert NumPy array - import numpy as np - table.insert1({'array_id': 1, 'data': np.random.rand(1000, 1000)}) + Args: + context: Ignored (legacy parameter, kept for API compatibility). + adapter_name: The type name, with or without angle brackets. + May include store parameter (e.g., ""). - # Fetch returns the original array - data = (table & 'array_id=1').fetch1('data') + Returns: + Tuple of (AttributeType instance, store_name or None). 
- Note: - - For internal storage (in database), use ```` - - For external storage without serialization, use ```` - - The store parameter is required unless a default store is configured + Raises: + DataJointError: If the type is not found. """ + type_name, store_name = parse_type_spec(adapter_name) - type_name = "xblob" - dtype = "" # Composition: uses ContentType for storage - - def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: - """ - Serialize a Python object to bytes. - - The object is serialized using DataJoint's blob format. The resulting - bytes are then passed to the underlying ```` type for storage. - - Args: - value: Any serializable Python object. - key: Primary key values (unused). - store_name: Store parameter (passed through to content storage). - - Returns: - Serialized bytes (will be stored by ContentType). - """ - from . import blob - - return blob.pack(value, compress=True) - - def decode(self, stored: bytes, *, key: dict | None = None) -> Any: - """ - Deserialize bytes back to a Python object. - - Args: - stored: Serialized bytes retrieved from content storage. - key: Primary key values (unused). + if is_type_registered(type_name): + return get_type(type_name), store_name - Returns: - The deserialized Python object. - """ - from . import blob + raise DataJointError(f"Attribute type <{type_name}> is not registered. " "Use @dj.register_type to register custom types.") - return blob.unpack(stored, squeeze=False) - - -def _register_builtin_types() -> None: - """ - Register DataJoint's built-in attribute types. - - Called automatically during module initialization. - """ - register_type(DJBlobType) - register_type(ContentType) - register_type(XBlobType) +# ============================================================================= +# Auto-register built-in types +# ============================================================================= -# Register built-in types when module is loaded -_register_builtin_types() +# Import builtin_types module to register built-in types (DJBlobType, ContentType, etc.) +# This import has a side effect: it registers the types via @register_type decorators +from . import builtin_types as _builtin_types # noqa: F401, E402 diff --git a/src/datajoint/builtin_types.py b/src/datajoint/builtin_types.py new file mode 100644 index 000000000..303b84945 --- /dev/null +++ b/src/datajoint/builtin_types.py @@ -0,0 +1,239 @@ +""" +Built-in DataJoint attribute types. + +This module defines the standard AttributeTypes that ship with DataJoint. +These serve as both useful built-in types and as examples for users who +want to create their own custom types. 
+ +Built-in Types: + - ````: Serialize Python objects to DataJoint's blob format (internal storage) + - ````: Content-addressed storage with SHA256 deduplication + - ````: External serialized blobs using content-addressed storage + +Example - Creating a Custom Type: + Here's how to define your own AttributeType, modeled after the built-in types:: + + import datajoint as dj + import networkx as nx + + @dj.register_type + class GraphType(dj.AttributeType): + '''Store NetworkX graphs as edge lists.''' + + type_name = "graph" # Use as in definitions + dtype = "" # Compose with djblob for serialization + + def encode(self, graph, *, key=None, store_name=None): + # Convert graph to a serializable format + return { + 'nodes': list(graph.nodes(data=True)), + 'edges': list(graph.edges(data=True)), + } + + def decode(self, stored, *, key=None): + # Reconstruct graph from stored format + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + G.add_edges_from(stored['edges']) + return G + + def validate(self, value): + if not isinstance(value, nx.Graph): + raise TypeError(f"Expected nx.Graph, got {type(value).__name__}") + + # Now use in table definitions: + @schema + class Networks(dj.Manual): + definition = ''' + network_id : int + --- + topology : + ''' +""" + +from __future__ import annotations + +from typing import Any + +from .attribute_type import AttributeType, register_type + + +# ============================================================================= +# DJBlob Types - DataJoint's native serialization +# ============================================================================= + + +@register_type +class DJBlobType(AttributeType): + """ + Serialize Python objects using DataJoint's blob format. + + The ```` type handles serialization of arbitrary Python objects + including NumPy arrays, dictionaries, lists, datetime objects, and UUIDs. + Data is stored in a MySQL ``LONGBLOB`` column. + + Format Features: + - Protocol headers (``mYm`` for MATLAB-compatible, ``dj0`` for Python-native) + - Optional zlib compression for data > 1KB + - Support for nested structures + + Example:: + + @schema + class ProcessedData(dj.Manual): + definition = ''' + data_id : int + --- + results : # Serialized Python objects + ''' + + # Insert any serializable object + table.insert1({'data_id': 1, 'results': {'scores': [0.9, 0.8], 'labels': ['a', 'b']}}) + + Note: + Plain ``longblob`` columns store raw bytes without serialization. + Use ```` when you need automatic serialization. + """ + + type_name = "djblob" + dtype = "longblob" + + def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: + """Serialize a Python object to DataJoint's blob format.""" + from . import blob + + return blob.pack(value, compress=True) + + def decode(self, stored: bytes, *, key: dict | None = None) -> Any: + """Deserialize blob bytes back to a Python object.""" + from . import blob + + return blob.unpack(stored, squeeze=False) + + +# ============================================================================= +# Content-Addressed Storage Types +# ============================================================================= + + +@register_type +class ContentType(AttributeType): + """ + Content-addressed storage with SHA256 deduplication. + + The ```` type stores raw bytes using content-addressed storage. 
+ Data is identified by its SHA256 hash and stored in a hierarchical directory: + ``_content/{hash[:2]}/{hash[2:4]}/{hash}`` + + The database column stores JSON metadata: ``{hash, store, size}``. + Duplicate content is automatically deduplicated. + + Example:: + + @schema + class RawContent(dj.Manual): + definition = ''' + content_id : int + --- + data : + ''' + + # Insert raw bytes + table.insert1({'content_id': 1, 'data': b'raw binary content'}) + + Note: + This type accepts only ``bytes``. For Python objects, use ````. + A store must be specified (e.g., ````) unless a default + store is configured. + """ + + type_name = "content" + dtype = "json" + + def encode(self, value: bytes, *, key: dict | None = None, store_name: str | None = None) -> dict: + """ + Store content and return metadata. + + Args: + value: Raw bytes to store. + key: Primary key values (unused). + store_name: Store to use. If None, uses default store. + + Returns: + Metadata dict: {hash, store, size} + """ + from .content_registry import put_content + + return put_content(value, store_name=store_name) + + def decode(self, stored: dict, *, key: dict | None = None) -> bytes: + """ + Retrieve content by hash. + + Args: + stored: Metadata dict with 'hash' and optionally 'store'. + key: Primary key values (unused). + + Returns: + Original bytes. + """ + from .content_registry import get_content + + return get_content(stored["hash"], store_name=stored.get("store")) + + def validate(self, value: Any) -> None: + """Validate that value is bytes.""" + if not isinstance(value, bytes): + raise TypeError(f" expects bytes, got {type(value).__name__}") + + +@register_type +class XBlobType(AttributeType): + """ + External serialized blobs with content-addressed storage. + + The ```` type combines DataJoint's blob serialization with + content-addressed storage. Objects are serialized, then stored externally + with automatic deduplication. + + This is ideal for large objects (NumPy arrays, DataFrames) that may be + duplicated across rows. + + Example:: + + @schema + class LargeArrays(dj.Manual): + definition = ''' + array_id : int + --- + data : + ''' + + import numpy as np + table.insert1({'array_id': 1, 'data': np.random.rand(1000, 1000)}) + + Type Composition: + ```` composes with ````:: + + Insert: object → blob.pack() → put_content() → JSON metadata + Fetch: JSON → get_content() → blob.unpack() → object + + Note: + - For internal storage, use ```` + - For raw bytes without serialization, use ```` + """ + + type_name = "xblob" + dtype = "" # Composition: uses ContentType + + def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: + """Serialize object to bytes (passed to ContentType).""" + from . import blob + + return blob.pack(value, compress=True) + + def decode(self, stored: bytes, *, key: dict | None = None) -> Any: + """Deserialize bytes back to Python object.""" + from . import blob + + return blob.unpack(stored, squeeze=False) From ad09877dbf149a83aeff9403a2974bc8172cacd7 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 22:25:09 +0000 Subject: [PATCH 34/41] Implement ObjectType for path-addressed storage Add type for files and folders (Zarr, HDF5, etc.): - Path derived from primary key: {schema}/{table}/objects/{pk}/{field}_{token} - Supports bytes, files, and directories - Returns ObjectRef for lazy fsspec-based access - No deduplication (unlike ) Update implementation plan with Phase 2b documenting ObjectType. 
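
Usage sketch (mirrors the ObjectType docstring; table and store names are
illustrative):

    @schema
    class Analysis(dj.Computed):
        definition = '''
        -> Recording
        ---
        results : <object@main>
        '''

    ref = (Analysis & key).fetch1('results')   # ObjectRef, lazy handle
    z = zarr.open(ref.fsmap)                   # read Zarr without downloading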
Co-authored-by: dimitri-yatsenko --- .../storage-types-implementation-plan.md | 70 ++++++- src/datajoint/builtin_types.py | 190 ++++++++++++++++++ 2 files changed, 257 insertions(+), 3 deletions(-) diff --git a/docs/src/design/tables/storage-types-implementation-plan.md b/docs/src/design/tables/storage-types-implementation-plan.md index a425837eb..22845c4ca 100644 --- a/docs/src/design/tables/storage-types-implementation-plan.md +++ b/docs/src/design/tables/storage-types-implementation-plan.md @@ -18,7 +18,8 @@ This plan describes the implementation of a three-layer type architecture for Da |-------|--------|-------| | Phase 1: Core Type System | ✅ Complete | CORE_TYPES dict, type chain resolution | | Phase 2: Content-Addressed Storage | ✅ Complete | Function-based, no registry table | -| Phase 3: User-Defined AttributeTypes | 🔲 Pending | XBlobType done, AttachType/FilepathType pending | +| Phase 2b: Path-Addressed Storage | ✅ Complete | ObjectType for files/folders | +| Phase 3: User-Defined AttributeTypes | 🔲 Pending | AttachType/FilepathType pending | | Phase 4: Insert and Fetch Integration | ✅ Complete | Type chain encoding/decoding | | Phase 5: Garbage Collection | 🔲 Pending | | | Phase 6: Migration Utilities | 🔲 Pending | | @@ -143,6 +144,58 @@ class XBlobType(AttributeType): --- +## Phase 2b: Path-Addressed Storage (ObjectType) ✅ + +**Status**: Complete + +### Design: Path vs Content Addressing + +| Aspect | `` | `` | +|--------|-------------|------------| +| Addressing | Content-hash (SHA256) | Path (from primary key) | +| Path Format | `_content/{hash[:2]}/{hash[2:4]}/{hash}` | `{schema}/{table}/objects/{pk}/{field}_{token}.ext` | +| Deduplication | Yes (same content = same hash) | No (each row has unique path) | +| Deletion | GC when unreferenced | Deleted with row | +| Use case | Serialized blobs, attachments | Zarr, HDF5, folders | + +### Implemented in `src/datajoint/builtin_types.py`: + +```python +@register_type +class ObjectType(AttributeType): + """Path-addressed storage for files and folders.""" + type_name = "object" + dtype = "json" + + def encode(self, value, *, key=None, store_name=None) -> dict: + # value can be bytes, str path, or Path + # key contains _schema, _table, _field for path construction + path, token = build_object_path(schema, table, field, primary_key, ext) + backend.put_buffer(content, path) # or put_folder for directories + return { + "path": path, + "store": store_name, + "size": size, + "ext": ext, + "is_dir": is_dir, + "timestamp": timestamp.isoformat(), + } + + def decode(self, stored: dict, *, key=None) -> ObjectRef: + # Returns lazy handle for fsspec-based access + return ObjectRef.from_json(stored, backend=backend) +``` + +### ObjectRef Features: +- `ref.path` - Storage path +- `ref.read()` - Read file content +- `ref.open()` - Open as file handle +- `ref.fsmap` - For `zarr.open(ref.fsmap)` +- `ref.download(dest)` - Download to local path +- `ref.listdir()` / `ref.walk()` - For directories + +--- + ## Phase 3: User-Defined AttributeTypes **Status**: Partially complete @@ -319,8 +372,11 @@ def garbage_collect(schemas: list, store_name: str, dry_run=True) -> dict: |------|--------|---------| | `src/datajoint/declare.py` | ✅ | CORE_TYPES, type parsing, SQL generation | | `src/datajoint/heading.py` | ✅ | Simplified attribute properties | -| `src/datajoint/attribute_type.py` | ✅ | ContentType, XBlobType, type chain resolution | +| `src/datajoint/attribute_type.py` | ✅ | Base class, registry, type chain resolution | +| 
`src/datajoint/builtin_types.py` | ✅ | DJBlobType, ContentType, XBlobType, ObjectType | | `src/datajoint/content_registry.py` | ✅ | Content storage functions (put, get, delete) | +| `src/datajoint/objectref.py` | ✅ | ObjectRef handle for lazy access | +| `src/datajoint/storage.py` | ✅ | StorageBackend, build_object_path | | `src/datajoint/table.py` | ✅ | Type chain encoding on insert | | `src/datajoint/fetch.py` | ✅ | Type chain decoding on fetch | | `src/datajoint/blob.py` | ✅ | Removed bypass_serialization | @@ -343,7 +399,7 @@ def garbage_collect(schemas: list, store_name: str, dry_run=True) -> dict: ``` Layer 3: AttributeTypes (user-facing) - , , , , + , , , , , , ↓ encode() / ↑ decode() Layer 2: Core DataJoint Types @@ -354,6 +410,14 @@ Layer 1: Native Database Types FLOAT, BIGINT, BINARY(16), JSON, LONGBLOB, VARCHAR(n), etc. ``` +**Built-in AttributeTypes:** +``` + → longblob (internal serialized storage) + → json (path-addressed, for Zarr/HDF5/folders) + → json (content-addressed with deduplication) + → json (external serialized with dedup) +``` + **Type Composition Example:** ``` → json (in DB) diff --git a/src/datajoint/builtin_types.py b/src/datajoint/builtin_types.py index 303b84945..27d5d872f 100644 --- a/src/datajoint/builtin_types.py +++ b/src/datajoint/builtin_types.py @@ -9,6 +9,7 @@ - ````: Serialize Python objects to DataJoint's blob format (internal storage) - ````: Content-addressed storage with SHA256 deduplication - ````: External serialized blobs using content-addressed storage + - ````: Path-addressed storage for files/folders (Zarr, HDF5) Example - Creating a Custom Type: Here's how to define your own AttributeType, modeled after the built-in types:: @@ -237,3 +238,192 @@ def decode(self, stored: bytes, *, key: dict | None = None) -> Any: from . import blob return blob.unpack(stored, squeeze=False) + + +# ============================================================================= +# Path-Addressed Storage Types (OAS - Object-Augmented Schema) +# ============================================================================= + + +@register_type +class ObjectType(AttributeType): + """ + Path-addressed storage for files and folders. + + The ```` type provides managed file/folder storage where the path + is derived from the primary key: ``{schema}/{table}/objects/{pk}/{field}_{token}.{ext}`` + + Unlike ```` (content-addressed), each row has its own storage path, + and content is deleted when the row is deleted. This is ideal for: + + - Zarr arrays (hierarchical chunked data) + - HDF5 files + - Complex multi-file outputs + - Any content that shouldn't be deduplicated + + Example:: + + @schema + class Analysis(dj.Computed): + definition = ''' + -> Recording + --- + results : + ''' + + def make(self, key): + # Store a file + self.insert1({**key, 'results': '/path/to/results.zarr'}) + + # Fetch returns ObjectRef for lazy access + ref = (Analysis & key).fetch1('results') + ref.path # Storage path + ref.read() # Read file content + ref.fsmap # For zarr.open(ref.fsmap) + + Storage Structure: + Objects are stored at:: + + {store_root}/{schema}/{table}/objects/{pk}/{field}_{token}.ext + + The token ensures uniqueness even if content is replaced. 
+ + Comparison with ````:: + + | Aspect | | | + |----------------|-------------------|---------------------| + | Addressing | Path (by PK) | Hash (by content) | + | Deduplication | No | Yes | + | Deletion | With row | GC when unreferenced| + | Use case | Zarr, HDF5 | Blobs, attachments | + + Note: + A store must be specified (````) unless a default store + is configured. Returns ``ObjectRef`` on fetch for lazy access. + """ + + type_name = "object" + dtype = "json" + + def encode( + self, + value: Any, + *, + key: dict | None = None, + store_name: str | None = None, + ) -> dict: + """ + Store content and return metadata. + + Args: + value: Content to store. Can be: + - bytes: Raw bytes to store as file + - str/Path: Path to local file or folder to upload + key: Dict containing context for path construction: + - _schema: Schema name + - _table: Table name + - _field: Field/attribute name + - Other entries are primary key values + store_name: Store to use. If None, uses default store. + + Returns: + Metadata dict suitable for ObjectRef.from_json() + """ + from datetime import datetime, timezone + from pathlib import Path + + from .content_registry import get_store_backend + from .storage import build_object_path + + # Extract context from key + key = key or {} + schema = key.pop("_schema", "unknown") + table = key.pop("_table", "unknown") + field = key.pop("_field", "data") + primary_key = {k: v for k, v in key.items() if not k.startswith("_")} + + # Determine content type and extension + is_dir = False + ext = None + size = None + + if isinstance(value, bytes): + content = value + size = len(content) + elif isinstance(value, (str, Path)): + source_path = Path(value) + if not source_path.exists(): + raise FileNotFoundError(f"Source path does not exist: {source_path}") + is_dir = source_path.is_dir() + ext = source_path.suffix if not is_dir else None + if is_dir: + # For directories, we'll upload later + content = None + else: + content = source_path.read_bytes() + size = len(content) + else: + raise TypeError(f" expects bytes or path, got {type(value).__name__}") + + # Build storage path + path, token = build_object_path( + schema=schema, + table=table, + field=field, + primary_key=primary_key, + ext=ext, + ) + + # Get storage backend + backend = get_store_backend(store_name) + + # Upload content + if is_dir: + # Upload directory recursively + source_path = Path(value) + backend.put_folder(str(source_path), path) + # Compute size by summing all files + size = sum(f.stat().st_size for f in source_path.rglob("*") if f.is_file()) + else: + backend.put_buffer(content, path) + + # Build metadata + timestamp = datetime.now(timezone.utc) + metadata = { + "path": path, + "store": store_name, + "size": size, + "ext": ext, + "is_dir": is_dir, + "timestamp": timestamp.isoformat(), + } + + return metadata + + def decode(self, stored: dict, *, key: dict | None = None) -> Any: + """ + Create ObjectRef handle for lazy access. + + Args: + stored: Metadata dict from database. + key: Primary key values (unused). + + Returns: + ObjectRef for accessing the stored content. 
+ """ + from .content_registry import get_store_backend + from .objectref import ObjectRef + + store_name = stored.get("store") + backend = get_store_backend(store_name) + return ObjectRef.from_json(stored, backend=backend) + + def validate(self, value: Any) -> None: + """Validate that value is bytes or a valid path.""" + from pathlib import Path + + if isinstance(value, bytes): + return + if isinstance(value, (str, Path)): + return + raise TypeError(f" expects bytes or path, got {type(value).__name__}") From dd8c623ae15561b8483f1417f72c41ccbe64dc64 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 22:25:57 +0000 Subject: [PATCH 35/41] Remove migration phase from implementation plan Migration utilities are out of scope for now. This is a breaking change version - users will need to recreate tables with new types. Co-authored-by: dimitri-yatsenko --- .../tables/storage-types-implementation-plan.md | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/docs/src/design/tables/storage-types-implementation-plan.md b/docs/src/design/tables/storage-types-implementation-plan.md index 22845c4ca..65567d03d 100644 --- a/docs/src/design/tables/storage-types-implementation-plan.md +++ b/docs/src/design/tables/storage-types-implementation-plan.md @@ -22,8 +22,7 @@ This plan describes the implementation of a three-layer type architecture for Da | Phase 3: User-Defined AttributeTypes | 🔲 Pending | AttachType/FilepathType pending | | Phase 4: Insert and Fetch Integration | ✅ Complete | Type chain encoding/decoding | | Phase 5: Garbage Collection | 🔲 Pending | | -| Phase 6: Migration Utilities | 🔲 Pending | | -| Phase 7: Documentation and Testing | 🔲 Pending | | +| Phase 6: Documentation and Testing | 🔲 Pending | | --- @@ -343,18 +342,7 @@ def garbage_collect(schemas: list, store_name: str, dry_run=True) -> dict: --- -## Phase 6: Migration Utilities 🔲 - -**Status**: Pending - -### Key migrations needed: -1. Legacy `~external_{store}` tables → content-addressed storage -2. UUID-based external references → hash-based JSON metadata -3. 
Legacy `filepath@store` → new `` with ObjectRef - ---- - -## Phase 7: Documentation and Testing 🔲 +## Phase 6: Documentation and Testing 🔲 **Status**: Pending @@ -381,7 +369,6 @@ def garbage_collect(schemas: list, store_name: str, dry_run=True) -> dict: | `src/datajoint/fetch.py` | ✅ | Type chain decoding on fetch | | `src/datajoint/blob.py` | ✅ | Removed bypass_serialization | | `src/datajoint/gc.py` | 🔲 | Garbage collection (to be created) | -| `src/datajoint/migrate.py` | 🔲 | Migration utilities | --- From e1b3be11b92347d78fb274ab464ffdc895368d18 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 22:43:55 +0000 Subject: [PATCH 36/41] Add staged insert documentation to implementation plan - Document staged_insert.py for direct object storage writes - Add flow comparison: normal insert vs staged insert - Include staged_insert.py in critical files summary Co-authored-by: dimitri-yatsenko --- .../storage-types-implementation-plan.md | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/docs/src/design/tables/storage-types-implementation-plan.md b/docs/src/design/tables/storage-types-implementation-plan.md index 65567d03d..cbdfb860e 100644 --- a/docs/src/design/tables/storage-types-implementation-plan.md +++ b/docs/src/design/tables/storage-types-implementation-plan.md @@ -193,6 +193,38 @@ class ObjectType(AttributeType): - `ref.download(dest)` - Download to local path - `ref.listdir()` / `ref.walk()` - For directories +### Staged Insert for Object Types + +For large objects like Zarr arrays, `staged_insert.py` provides direct writes to storage: + +```python +with table.staged_insert1 as staged: + # 1. Set primary key first (required for path construction) + staged.rec['subject_id'] = 123 + staged.rec['session_id'] = 45 + + # 2. Get storage handle and write directly + z = zarr.open(staged.store('raw_data', '.zarr'), mode='w') + z[:] = large_array + + # 3. On exit: metadata computed, record inserted +``` + +**Flow comparison:** + +| Normal Insert | Staged Insert | +|--------------|---------------| +| `ObjectType.encode()` uploads content | Direct writes via `staged.store()` | +| Single operation | Two-phase: write then finalize | +| Good for files/folders | Ideal for Zarr, HDF5, streaming | + +Both produce the same JSON metadata format compatible with `ObjectRef.from_json()`. 
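+
+A sketch of that metadata and the decode path (field values are illustrative; see
+`ObjectType.encode()` above for the authoritative format):
+
+```python
+from datajoint.content_registry import get_store_backend
+from datajoint.objectref import ObjectRef
+
+meta = {
+    "path": "myschema/analysis/objects/subject_id=123/raw_data_ab12cd34",  # illustrative
+    "store": "main",
+    "size": 1048576,
+    "ext": None,          # None for directory-style objects such as Zarr stores
+    "is_dir": True,
+    "timestamp": "2025-12-25T22:00:00+00:00",
+}
+ref = ObjectRef.from_json(meta, backend=get_store_backend(meta["store"]))
+```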
+ +**Key methods:** +- `staged.store(field, ext)` - Returns `FSMap` for Zarr/xarray +- `staged.open(field, ext)` - Returns file handle for binary writes +- `staged.fs` - Raw fsspec filesystem access + --- ## Phase 3: User-Defined AttributeTypes @@ -365,6 +397,7 @@ def garbage_collect(schemas: list, store_name: str, dry_run=True) -> dict: | `src/datajoint/content_registry.py` | ✅ | Content storage functions (put, get, delete) | | `src/datajoint/objectref.py` | ✅ | ObjectRef handle for lazy access | | `src/datajoint/storage.py` | ✅ | StorageBackend, build_object_path | +| `src/datajoint/staged_insert.py` | ✅ | Staged insert for direct object storage writes | | `src/datajoint/table.py` | ✅ | Type chain encoding on insert | | `src/datajoint/fetch.py` | ✅ | Type chain decoding on fetch | | `src/datajoint/blob.py` | ✅ | Removed bypass_serialization | From ca0b9149fe831f9317204ee75f3d68bef51e6bef Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 22:56:33 +0000 Subject: [PATCH 37/41] Implement Phase 3: AttachType, XAttachType, FilepathType Add remaining built-in AttributeTypes: - : Internal file attachment stored in longblob - : External file attachment via with deduplication - : Reference to existing file (no copy, returns ObjectRef) Update implementation plan to mark Phase 3 complete. Co-authored-by: dimitri-yatsenko --- .../storage-types-implementation-plan.md | 53 ++- src/datajoint/builtin_types.py | 317 ++++++++++++++++++ 2 files changed, 352 insertions(+), 18 deletions(-) diff --git a/docs/src/design/tables/storage-types-implementation-plan.md b/docs/src/design/tables/storage-types-implementation-plan.md index cbdfb860e..6d6d2979b 100644 --- a/docs/src/design/tables/storage-types-implementation-plan.md +++ b/docs/src/design/tables/storage-types-implementation-plan.md @@ -19,7 +19,7 @@ This plan describes the implementation of a three-layer type architecture for Da | Phase 1: Core Type System | ✅ Complete | CORE_TYPES dict, type chain resolution | | Phase 2: Content-Addressed Storage | ✅ Complete | Function-based, no registry table | | Phase 2b: Path-Addressed Storage | ✅ Complete | ObjectType for files/folders | -| Phase 3: User-Defined AttributeTypes | 🔲 Pending | AttachType/FilepathType pending | +| Phase 3: User-Defined AttributeTypes | ✅ Complete | AttachType, XAttachType, FilepathType | | Phase 4: Insert and Fetch Integration | ✅ Complete | Type chain encoding/decoding | | Phase 5: Garbage Collection | 🔲 Pending | | | Phase 6: Documentation and Testing | 🔲 Pending | | @@ -227,14 +227,16 @@ Both produce the same JSON metadata format compatible with `ObjectRef.from_json( --- -## Phase 3: User-Defined AttributeTypes +## Phase 3: User-Defined AttributeTypes ✅ -**Status**: Partially complete +**Status**: Complete + +All built-in AttributeTypes are implemented in `src/datajoint/builtin_types.py`. ### 3.1 XBlobType ✅ -Implemented as shown above. Composes with ``. +External serialized blobs using content-addressed storage. Composes with ``. 
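+
+For orientation, a minimal sketch of how `<xblob>` composes with `<content>`; the
+`blob.pack`/`blob.unpack` calls are an assumption about the serialization layer, not a
+verbatim excerpt of `builtin_types.py`:
+
+```python
+@register_type
+class XBlobType(AttributeType):
+    """External serialized blob: serialize with the DataJoint blob codec, then hand
+    the bytes to <content> for hashed, deduplicated storage."""
+
+    type_name = "xblob"
+    dtype = "<content>"  # composition: ContentType turns the bytes into JSON metadata
+
+    def encode(self, value, *, key=None, store_name=None) -> bytes:
+        from datajoint import blob
+        return blob.pack(value)      # bytes; <content> stores them and records the hash
+
+    def decode(self, stored: bytes, *, key=None):
+        from datajoint import blob
+        return blob.unpack(stored)   # <content> has already resolved hash -> bytes
+```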
-### 3.2 AttachType and XAttachType 🔲 +### 3.2 AttachType ✅ ```python @register_type @@ -243,41 +245,53 @@ class AttachType(AttributeType): type_name = "attach" dtype = "longblob" - def encode(self, filepath, *, key=None) -> bytes: - path = Path(filepath) - return path.name.encode() + b"\0" + path.read_bytes() + def encode(self, filepath, *, key=None, store_name=None) -> bytes: + # Returns: filename (UTF-8) + null byte + contents + return path.name.encode("utf-8") + b"\x00" + path.read_bytes() def decode(self, stored, *, key=None) -> str: - filename, contents = stored.split(b"\0", 1) - # Write to download_path and return path + # Extracts to download_path, returns local path ... +``` + +### 3.3 XAttachType ✅ +```python @register_type class XAttachType(AttributeType): """External file attachment using content-addressed storage.""" type_name = "xattach" - dtype = "" - # Similar to AttachType but composes with content storage + dtype = "" # Composes with ContentType + # Same encode/decode as AttachType, but stored externally with dedup ``` -### 3.3 FilepathType 🔲 +### 3.4 FilepathType ✅ ```python @register_type class FilepathType(AttributeType): - """Portable relative path reference within configured stores.""" + """Reference to existing file in configured store.""" type_name = "filepath" dtype = "json" def encode(self, relative_path: str, *, key=None, store_name=None) -> dict: - """Register reference to file in store.""" - return {'path': relative_path, 'store': store_name} + # Verifies file exists, returns metadata + return {'path': path, 'store': store_name, 'size': size, ...} def decode(self, stored: dict, *, key=None) -> ObjectRef: - """Return ObjectRef for lazy access.""" - return ObjectRef(store=stored['store'], path=stored['path']) + # Returns ObjectRef for lazy access + return ObjectRef.from_json(stored, backend=backend) ``` +### Type Comparison + +| Type | Storage | Copies File | Dedup | Returns | +|------|---------|-------------|-------|---------| +| `` | Database | Yes | No | Local path | +| `` | External | Yes | Yes | Local path | +| `` | Reference | No | N/A | ObjectRef | +| `` | External | Yes | No | ObjectRef | + --- ## Phase 4: Insert and Fetch Integration ✅ @@ -433,9 +447,12 @@ Layer 1: Native Database Types **Built-in AttributeTypes:** ``` → longblob (internal serialized storage) + → longblob (internal file attachment) → json (path-addressed, for Zarr/HDF5/folders) + → json (reference to existing file in store) → json (content-addressed with deduplication) → json (external serialized with dedup) + → json (external file attachment with dedup) ``` **Type Composition Example:** diff --git a/src/datajoint/builtin_types.py b/src/datajoint/builtin_types.py index 27d5d872f..bb2bb20a6 100644 --- a/src/datajoint/builtin_types.py +++ b/src/datajoint/builtin_types.py @@ -10,6 +10,9 @@ - ````: Content-addressed storage with SHA256 deduplication - ````: External serialized blobs using content-addressed storage - ````: Path-addressed storage for files/folders (Zarr, HDF5) + - ````: Internal file attachment stored in database + - ````: External file attachment with deduplication + - ````: Reference to existing file in store Example - Creating a Custom Type: Here's how to define your own AttributeType, modeled after the built-in types:: @@ -427,3 +430,317 @@ def validate(self, value: Any) -> None: if isinstance(value, (str, Path)): return raise TypeError(f" expects bytes or path, got {type(value).__name__}") + + +# 
============================================================================= +# File Attachment Types +# ============================================================================= + + +@register_type +class AttachType(AttributeType): + """ + Internal file attachment stored in database. + + The ```` type stores a file directly in the database as a ``LONGBLOB``. + The filename is preserved and the file is extracted to the configured + download path on fetch. + + Example:: + + @schema + class Documents(dj.Manual): + definition = ''' + doc_id : int + --- + report : + ''' + + # Insert a file + table.insert1({'doc_id': 1, 'report': '/path/to/report.pdf'}) + + # Fetch extracts to download_path and returns local path + local_path = (table & 'doc_id=1').fetch1('report') + + Storage Format: + The blob contains: ``filename\\0contents`` + - Filename (UTF-8 encoded) + null byte + raw file contents + + Note: + - For large files, use ```` (external storage with deduplication) + - For files that shouldn't be copied, use ```` + """ + + type_name = "attach" + dtype = "longblob" + + def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: + """ + Read file and encode as filename + contents. + + Args: + value: Path to file (str or Path). + key: Primary key values (unused). + store_name: Unused for internal storage. + + Returns: + Bytes: filename (UTF-8) + null byte + file contents + """ + from pathlib import Path + + path = Path(value) + if not path.exists(): + raise FileNotFoundError(f"Attachment file not found: {path}") + if path.is_dir(): + raise IsADirectoryError(f" does not support directories: {path}") + + filename = path.name + contents = path.read_bytes() + return filename.encode("utf-8") + b"\x00" + contents + + def decode(self, stored: bytes, *, key: dict | None = None) -> str: + """ + Extract file to download path and return local path. + + Args: + stored: Blob containing filename + null + contents. + key: Primary key values (unused). + + Returns: + Path to extracted file as string. + """ + from pathlib import Path + + from .settings import config + + # Split on first null byte + null_pos = stored.index(b"\x00") + filename = stored[:null_pos].decode("utf-8") + contents = stored[null_pos + 1 :] + + # Write to download path + download_path = Path(config.get("download_path", ".")) + download_path.mkdir(parents=True, exist_ok=True) + local_path = download_path / filename + + local_path.write_bytes(contents) + return str(local_path) + + def validate(self, value: Any) -> None: + """Validate that value is a valid file path.""" + from pathlib import Path + + if not isinstance(value, (str, Path)): + raise TypeError(f" expects a file path, got {type(value).__name__}") + + +@register_type +class XAttachType(AttributeType): + """ + External file attachment with content-addressed storage. + + The ```` type stores files externally using content-addressed + storage. Like ````, the filename is preserved and the file is + extracted on fetch. Unlike ````, files are stored externally + with automatic deduplication. 
+ + Example:: + + @schema + class LargeDocuments(dj.Manual): + definition = ''' + doc_id : int + --- + dataset : + ''' + + # Insert a large file + table.insert1({'doc_id': 1, 'dataset': '/path/to/large_file.h5'}) + + # Fetch downloads and returns local path + local_path = (table & 'doc_id=1').fetch1('dataset') + + Type Composition: + ```` composes with ````:: + + Insert: file → read + encode filename → put_content() → JSON + Fetch: JSON → get_content() → extract → local path + + Comparison:: + + | Type | Storage | Deduplication | Best for | + |------------|----------|---------------|---------------------| + | | Database | No | Small files (<16MB) | + | | External | Yes | Large files | + """ + + type_name = "xattach" + dtype = "" # Composition: uses ContentType + + def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: + """ + Read file and encode as filename + contents. + + Args: + value: Path to file (str or Path). + key: Primary key values (unused). + store_name: Passed to ContentType for storage. + + Returns: + Bytes: filename (UTF-8) + null byte + file contents + """ + from pathlib import Path + + path = Path(value) + if not path.exists(): + raise FileNotFoundError(f"Attachment file not found: {path}") + if path.is_dir(): + raise IsADirectoryError(f" does not support directories: {path}") + + filename = path.name + contents = path.read_bytes() + return filename.encode("utf-8") + b"\x00" + contents + + def decode(self, stored: bytes, *, key: dict | None = None) -> str: + """ + Extract file to download path and return local path. + + Args: + stored: Bytes containing filename + null + contents. + key: Primary key values (unused). + + Returns: + Path to extracted file as string. + """ + from pathlib import Path + + from .settings import config + + # Split on first null byte + null_pos = stored.index(b"\x00") + filename = stored[:null_pos].decode("utf-8") + contents = stored[null_pos + 1 :] + + # Write to download path + download_path = Path(config.get("download_path", ".")) + download_path.mkdir(parents=True, exist_ok=True) + local_path = download_path / filename + + local_path.write_bytes(contents) + return str(local_path) + + def validate(self, value: Any) -> None: + """Validate that value is a valid file path.""" + from pathlib import Path + + if not isinstance(value, (str, Path)): + raise TypeError(f" expects a file path, got {type(value).__name__}") + + +# ============================================================================= +# Filepath Reference Type +# ============================================================================= + + +@register_type +class FilepathType(AttributeType): + """ + Reference to existing file in configured store. + + The ```` type stores a reference to a file that already + exists in the storage backend. Unlike ```` or ````, no + file copying occurs - only the path is recorded. 
+ + This is useful when: + - Files are managed externally (e.g., by acquisition software) + - Files are too large to copy + - You want to reference shared datasets + + Example:: + + @schema + class Recordings(dj.Manual): + definition = ''' + recording_id : int + --- + raw_data : + ''' + + # Reference an existing file (no copy) + table.insert1({'recording_id': 1, 'raw_data': 'subject01/session001/data.bin'}) + + # Fetch returns ObjectRef for lazy access + ref = (table & 'recording_id=1').fetch1('raw_data') + ref.read() # Read file content + ref.download() # Download to local path + + Storage Format: + JSON metadata: ``{path, store}`` + + Warning: + The file must exist in the store at the specified path. + DataJoint does not manage the lifecycle of referenced files. + """ + + type_name = "filepath" + dtype = "json" + + def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> dict: + """ + Store path reference as JSON metadata. + + Args: + value: Relative path within the store (str). + key: Primary key values (unused). + store_name: Store where the file exists. + + Returns: + Metadata dict: {path, store} + """ + from datetime import datetime, timezone + + from .content_registry import get_store_backend + + path = str(value) + + # Optionally verify file exists + backend = get_store_backend(store_name) + if not backend.exists(path): + raise FileNotFoundError(f"File not found in store '{store_name or 'default'}': {path}") + + # Get file info + try: + size = backend.size(path) + except Exception: + size = None + + return { + "path": path, + "store": store_name, + "size": size, + "is_dir": False, + "timestamp": datetime.now(timezone.utc).isoformat(), + } + + def decode(self, stored: dict, *, key: dict | None = None) -> Any: + """ + Create ObjectRef handle for lazy access. + + Args: + stored: Metadata dict with path and store. + key: Primary key values (unused). + + Returns: + ObjectRef for accessing the file. + """ + from .content_registry import get_store_backend + from .objectref import ObjectRef + + store_name = stored.get("store") + backend = get_store_backend(store_name) + return ObjectRef.from_json(stored, backend=backend) + + def validate(self, value: Any) -> None: + """Validate that value is a path string.""" + if not isinstance(value, str): + raise TypeError(f" expects a path string, got {type(value).__name__}") From d0f56144a38c2b79aaca79b6059400a6703784e5 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 23:14:20 +0000 Subject: [PATCH 38/41] Implement Phase 5 (GC) and Phase 6 (Tests) Add garbage collection module (gc.py) for content-addressed storage: - scan_references() to find content hashes in schemas - list_stored_content() to enumerate _content/ directory - scan() for orphan detection without deletion - collect() for orphan removal with dry_run option - format_stats() for human-readable output Add test files: - test_content_storage.py for content_registry.py functions - test_type_composition.py for type chain encoding/decoding - test_gc.py for garbage collection Update implementation plan to mark all phases complete. 
Co-authored-by: dimitri-yatsenko --- .../storage-types-implementation-plan.md | 81 ++-- src/datajoint/gc.py | 327 ++++++++++++++++ tests/test_content_storage.py | 231 ++++++++++++ tests/test_gc.py | 214 +++++++++++ tests/test_type_composition.py | 352 ++++++++++++++++++ 5 files changed, 1158 insertions(+), 47 deletions(-) create mode 100644 src/datajoint/gc.py create mode 100644 tests/test_content_storage.py create mode 100644 tests/test_gc.py create mode 100644 tests/test_type_composition.py diff --git a/docs/src/design/tables/storage-types-implementation-plan.md b/docs/src/design/tables/storage-types-implementation-plan.md index 6d6d2979b..8ce582f57 100644 --- a/docs/src/design/tables/storage-types-implementation-plan.md +++ b/docs/src/design/tables/storage-types-implementation-plan.md @@ -21,8 +21,8 @@ This plan describes the implementation of a three-layer type architecture for Da | Phase 2b: Path-Addressed Storage | ✅ Complete | ObjectType for files/folders | | Phase 3: User-Defined AttributeTypes | ✅ Complete | AttachType, XAttachType, FilepathType | | Phase 4: Insert and Fetch Integration | ✅ Complete | Type chain encoding/decoding | -| Phase 5: Garbage Collection | 🔲 Pending | | -| Phase 6: Documentation and Testing | 🔲 Pending | | +| Phase 5: Garbage Collection | ✅ Complete | gc.py with scan/collect functions | +| Phase 6: Documentation and Testing | ✅ Complete | Test files for all new types | --- @@ -337,66 +337,50 @@ def _get(connection, attr, data, squeeze, download_path): --- -## Phase 5: Garbage Collection 🔲 +## Phase 5: Garbage Collection ✅ -**Status**: Pending - -### Design (updated for function-based approach): +**Status**: Complete -Since we don't have a registry table, GC works by scanning: +### Implemented in `src/datajoint/gc.py`: ```python -def scan_content_references(schemas: list) -> set[tuple[str, str]]: - """ - Scan all schemas for content references. - - Returns: - Set of (content_hash, store) tuples that are referenced - """ - referenced = set() - for schema in schemas: - for table in schema.tables: - for attr in table.heading.attributes: - if uses_content_storage(attr): - # Fetch all JSON metadata from this column - for row in table.fetch(attr.name): - if isinstance(row, dict) and 'hash' in row: - referenced.add((row['hash'], row.get('store'))) - return referenced - -def list_stored_content(store_name: str) -> set[str]: - """List all content hashes in a store by scanning _content/ directory.""" - ... - -def garbage_collect(schemas: list, store_name: str, dry_run=True) -> dict: - """ - Remove unreferenced content from storage. 
+import datajoint as dj - Returns: - Stats: {'scanned': N, 'orphaned': M, 'deleted': K, 'bytes_freed': B} - """ - referenced = scan_content_references(schemas) - stored = list_stored_content(store_name) - orphaned = stored - {h for h, s in referenced if s == store_name} +# Scan schemas and find orphaned content +stats = dj.gc.scan(schema1, schema2, store_name='mystore') - if not dry_run: - for content_hash in orphaned: - delete_content(content_hash, store_name) +# Remove orphaned content (dry_run=False to actually delete) +stats = dj.gc.collect(schema1, schema2, store_name='mystore', dry_run=True) - return {'orphaned': len(orphaned), ...} +# Format statistics for display +print(dj.gc.format_stats(stats)) ``` +**Key functions:** +- `scan_references(*schemas, store_name=None)` - Scan tables for content hashes +- `list_stored_content(store_name=None)` - List all content in `_content/` directory +- `scan(*schemas, store_name=None)` - Find orphaned content without deleting +- `collect(*schemas, store_name=None, dry_run=True)` - Remove orphaned content +- `format_stats(stats)` - Human-readable statistics output + +**GC Process:** +1. Scan all tables in provided schemas for content-type attributes +2. Extract content hashes from JSON metadata in those columns +3. Scan storage `_content/` directory for all stored hashes +4. Compute orphaned = stored - referenced +5. Optionally delete orphaned content (when `dry_run=False`) + --- -## Phase 6: Documentation and Testing 🔲 +## Phase 6: Documentation and Testing ✅ -**Status**: Pending +**Status**: Complete -### Test files to create: +### Test files created: - `tests/test_content_storage.py` - Content-addressed storage functions -- `tests/test_xblob.py` - XBlobType roundtrip - `tests/test_type_composition.py` - Type chain encoding/decoding - `tests/test_gc.py` - Garbage collection +- `tests/test_attribute_type.py` - AttributeType registry and DJBlobType (existing) --- @@ -415,7 +399,10 @@ def garbage_collect(schemas: list, store_name: str, dry_run=True) -> dict: | `src/datajoint/table.py` | ✅ | Type chain encoding on insert | | `src/datajoint/fetch.py` | ✅ | Type chain decoding on fetch | | `src/datajoint/blob.py` | ✅ | Removed bypass_serialization | -| `src/datajoint/gc.py` | 🔲 | Garbage collection (to be created) | +| `src/datajoint/gc.py` | ✅ | Garbage collection for content storage | +| `tests/test_content_storage.py` | ✅ | Tests for content_registry.py | +| `tests/test_type_composition.py` | ✅ | Tests for type chain encoding/decoding | +| `tests/test_gc.py` | ✅ | Tests for garbage collection | --- diff --git a/src/datajoint/gc.py b/src/datajoint/gc.py new file mode 100644 index 000000000..e862287fc --- /dev/null +++ b/src/datajoint/gc.py @@ -0,0 +1,327 @@ +""" +Garbage collection for content-addressed storage. + +This module provides utilities to identify and remove orphaned content +from external storage. Content becomes orphaned when all database rows +referencing it are deleted. 
+ +Usage: + import datajoint as dj + + # Scan schemas and find orphaned content + stats = dj.gc.scan(schema1, schema2, store_name='mystore') + + # Remove orphaned content (dry_run=False to actually delete) + stats = dj.gc.collect(schema1, schema2, store_name='mystore', dry_run=True) +""" + +from __future__ import annotations + +import json +import logging +from typing import TYPE_CHECKING, Any + +from .content_registry import delete_content, get_store_backend +from .errors import DataJointError + +if TYPE_CHECKING: + from .schemas import Schema + +logger = logging.getLogger(__name__.split(".")[0]) + + +def _uses_content_storage(attr) -> bool: + """ + Check if an attribute uses content-addressed storage. + + This includes types that compose with : + - directly + - (composes with ) + - (composes with ) + + Args: + attr: Attribute from table heading + + Returns: + True if the attribute stores content hashes + """ + if not attr.adapter: + return False + + # Check if this type or its composition chain uses content storage + type_name = getattr(attr.adapter, "type_name", "") + return type_name in ("content", "xblob", "xattach") + + +def _extract_content_refs(value: Any) -> list[tuple[str, str | None]]: + """ + Extract content references from a stored value. + + Args: + value: The stored value (could be JSON string or dict) + + Returns: + List of (content_hash, store_name) tuples + """ + refs = [] + + if value is None: + return refs + + # Parse JSON if string + if isinstance(value, str): + try: + value = json.loads(value) + except (json.JSONDecodeError, TypeError): + return refs + + # Extract hash from dict + if isinstance(value, dict) and "hash" in value: + refs.append((value["hash"], value.get("store"))) + + return refs + + +def scan_references( + *schemas: "Schema", + store_name: str | None = None, + verbose: bool = False, +) -> set[str]: + """ + Scan schemas for content references. + + Examines all tables in the given schemas and extracts content hashes + from columns that use content-addressed storage (, , ). + + Args: + *schemas: Schema instances to scan + store_name: Only include references to this store (None = all stores) + verbose: Print progress information + + Returns: + Set of content hashes that are referenced + """ + referenced: set[str] = set() + + for schema in schemas: + if verbose: + logger.info(f"Scanning schema: {schema.database}") + + # Get all tables in schema + for table_name in schema.list_tables(): + try: + # Get table class + table = schema.spawn_table(table_name) + + # Check each attribute for content storage + for attr_name, attr in table.heading.attributes.items(): + if not _uses_content_storage(attr): + continue + + if verbose: + logger.info(f" Scanning {table_name}.{attr_name}") + + # Fetch all values for this attribute + # Use raw fetch to get JSON strings + try: + values = table.fetch(attr_name) + for value in values: + for content_hash, ref_store in _extract_content_refs(value): + # Filter by store if specified + if store_name is None or ref_store == store_name: + referenced.add(content_hash) + except Exception as e: + logger.warning(f"Error scanning {table_name}.{attr_name}: {e}") + + except Exception as e: + logger.warning(f"Error accessing table {table_name}: {e}") + + return referenced + + +def list_stored_content(store_name: str | None = None) -> dict[str, int]: + """ + List all content hashes in storage. + + Scans the _content/ directory in the specified store and returns + all content hashes found. 
+ + Args: + store_name: Store to scan (None = default store) + + Returns: + Dict mapping content_hash to size in bytes + """ + backend = get_store_backend(store_name) + stored: dict[str, int] = {} + + # Content is stored at _content/{hash[:2]}/{hash[2:4]}/{hash} + content_prefix = "_content/" + + try: + # List all files under _content/ + full_prefix = backend._full_path(content_prefix) + + for root, dirs, files in backend.fs.walk(full_prefix): + for filename in files: + # Skip manifest files + if filename.endswith(".manifest.json"): + continue + + # The filename is the full hash + content_hash = filename + + # Validate it looks like a hash (64 hex chars) + if len(content_hash) == 64 and all(c in "0123456789abcdef" for c in content_hash): + try: + file_path = f"{root}/{filename}" + size = backend.fs.size(file_path) + stored[content_hash] = size + except Exception: + stored[content_hash] = 0 + + except FileNotFoundError: + # No _content/ directory exists yet + pass + except Exception as e: + logger.warning(f"Error listing stored content: {e}") + + return stored + + +def scan( + *schemas: "Schema", + store_name: str | None = None, + verbose: bool = False, +) -> dict[str, Any]: + """ + Scan for orphaned content without deleting. + + Args: + *schemas: Schema instances to scan + store_name: Store to check (None = default store) + verbose: Print progress information + + Returns: + Dict with scan statistics: + - referenced: Number of content items referenced in database + - stored: Number of content items in storage + - orphaned: Number of unreferenced content items + - orphaned_bytes: Total size of orphaned content + - orphaned_hashes: List of orphaned content hashes + """ + if not schemas: + raise DataJointError("At least one schema must be provided") + + # Find all referenced content + referenced = scan_references(*schemas, store_name=store_name, verbose=verbose) + + # Find all stored content + stored = list_stored_content(store_name) + + # Find orphaned content + orphaned_hashes = set(stored.keys()) - referenced + orphaned_bytes = sum(stored.get(h, 0) for h in orphaned_hashes) + + return { + "referenced": len(referenced), + "stored": len(stored), + "orphaned": len(orphaned_hashes), + "orphaned_bytes": orphaned_bytes, + "orphaned_hashes": sorted(orphaned_hashes), + } + + +def collect( + *schemas: "Schema", + store_name: str | None = None, + dry_run: bool = True, + verbose: bool = False, +) -> dict[str, Any]: + """ + Remove orphaned content from storage. + + Scans the given schemas for content references, then removes any + content in storage that is not referenced. 
+ + Args: + *schemas: Schema instances to scan + store_name: Store to clean (None = default store) + dry_run: If True, report what would be deleted without deleting + verbose: Print progress information + + Returns: + Dict with collection statistics: + - referenced: Number of content items referenced in database + - stored: Number of content items in storage + - orphaned: Number of unreferenced content items + - deleted: Number of items deleted (0 if dry_run) + - bytes_freed: Bytes freed (0 if dry_run) + - errors: Number of deletion errors + """ + # First scan to find orphaned content + stats = scan(*schemas, store_name=store_name, verbose=verbose) + + deleted = 0 + bytes_freed = 0 + errors = 0 + + if not dry_run and stats["orphaned"] > 0: + stored = list_stored_content(store_name) + + for content_hash in stats["orphaned_hashes"]: + try: + size = stored.get(content_hash, 0) + if delete_content(content_hash, store_name): + deleted += 1 + bytes_freed += size + if verbose: + logger.info(f"Deleted: {content_hash[:16]}... ({size} bytes)") + except Exception as e: + errors += 1 + logger.warning(f"Failed to delete {content_hash[:16]}...: {e}") + + return { + "referenced": stats["referenced"], + "stored": stats["stored"], + "orphaned": stats["orphaned"], + "deleted": deleted, + "bytes_freed": bytes_freed, + "errors": errors, + "dry_run": dry_run, + } + + +def format_stats(stats: dict[str, Any]) -> str: + """ + Format GC statistics as a human-readable string. + + Args: + stats: Statistics dict from scan() or collect() + + Returns: + Formatted string + """ + lines = [ + "Content Storage Statistics:", + f" Referenced in database: {stats['referenced']}", + f" Stored in backend: {stats['stored']}", + f" Orphaned (unreferenced): {stats['orphaned']}", + ] + + if "orphaned_bytes" in stats: + size_mb = stats["orphaned_bytes"] / (1024 * 1024) + lines.append(f" Orphaned size: {size_mb:.2f} MB") + + if "deleted" in stats: + lines.append("") + if stats.get("dry_run", True): + lines.append(" [DRY RUN - no changes made]") + else: + lines.append(f" Deleted: {stats['deleted']}") + freed_mb = stats["bytes_freed"] / (1024 * 1024) + lines.append(f" Bytes freed: {freed_mb:.2f} MB") + if stats.get("errors", 0) > 0: + lines.append(f" Errors: {stats['errors']}") + + return "\n".join(lines) diff --git a/tests/test_content_storage.py b/tests/test_content_storage.py new file mode 100644 index 000000000..e6d0f14cc --- /dev/null +++ b/tests/test_content_storage.py @@ -0,0 +1,231 @@ +""" +Tests for content-addressed storage (content_registry.py). +""" + +import hashlib +from unittest.mock import MagicMock, patch + +import pytest + +from datajoint.content_registry import ( + build_content_path, + compute_content_hash, + content_exists, + delete_content, + get_content, + get_content_size, + put_content, +) +from datajoint.errors import DataJointError + + +class TestComputeContentHash: + """Tests for compute_content_hash function.""" + + def test_computes_sha256(self): + """Test that SHA256 hash is computed correctly.""" + data = b"Hello, World!" 
+ result = compute_content_hash(data) + + # Verify against known SHA256 hash + expected = hashlib.sha256(data).hexdigest() + assert result == expected + assert len(result) == 64 # SHA256 produces 64 hex chars + + def test_empty_bytes(self): + """Test hashing empty bytes.""" + result = compute_content_hash(b"") + expected = hashlib.sha256(b"").hexdigest() + assert result == expected + + def test_different_content_different_hash(self): + """Test that different content produces different hashes.""" + hash1 = compute_content_hash(b"content1") + hash2 = compute_content_hash(b"content2") + assert hash1 != hash2 + + def test_same_content_same_hash(self): + """Test that same content produces same hash.""" + data = b"identical content" + hash1 = compute_content_hash(data) + hash2 = compute_content_hash(data) + assert hash1 == hash2 + + +class TestBuildContentPath: + """Tests for build_content_path function.""" + + def test_builds_hierarchical_path(self): + """Test that path is built with proper hierarchy.""" + # Example hash: abcdef... + test_hash = "abcdef0123456789" * 4 # 64 chars + result = build_content_path(test_hash) + + # Path should be _content/{hash[:2]}/{hash[2:4]}/{hash} + assert result == f"_content/ab/cd/{test_hash}" + + def test_rejects_invalid_hash_length(self): + """Test that invalid hash length raises error.""" + with pytest.raises(DataJointError, match="Invalid content hash length"): + build_content_path("tooshort") + + with pytest.raises(DataJointError, match="Invalid content hash length"): + build_content_path("a" * 65) # Too long + + def test_real_hash_path(self): + """Test path building with a real computed hash.""" + data = b"test content" + content_hash = compute_content_hash(data) + path = build_content_path(content_hash) + + # Verify structure + parts = path.split("/") + assert parts[0] == "_content" + assert len(parts[1]) == 2 + assert len(parts[2]) == 2 + assert len(parts[3]) == 64 + assert parts[1] == content_hash[:2] + assert parts[2] == content_hash[2:4] + assert parts[3] == content_hash + + +class TestPutContent: + """Tests for put_content function.""" + + @patch("datajoint.content_registry.get_store_backend") + def test_stores_new_content(self, mock_get_backend): + """Test storing new content.""" + mock_backend = MagicMock() + mock_backend.exists.return_value = False + mock_get_backend.return_value = mock_backend + + data = b"new content" + result = put_content(data, store_name="test_store") + + # Verify return value + assert "hash" in result + assert result["hash"] == compute_content_hash(data) + assert result["store"] == "test_store" + assert result["size"] == len(data) + + # Verify backend was called + mock_backend.put_buffer.assert_called_once() + + @patch("datajoint.content_registry.get_store_backend") + def test_deduplicates_existing_content(self, mock_get_backend): + """Test that existing content is not re-uploaded.""" + mock_backend = MagicMock() + mock_backend.exists.return_value = True # Content already exists + mock_get_backend.return_value = mock_backend + + data = b"existing content" + result = put_content(data, store_name="test_store") + + # Verify return value is still correct + assert result["hash"] == compute_content_hash(data) + assert result["size"] == len(data) + + # Verify put_buffer was NOT called (deduplication) + mock_backend.put_buffer.assert_not_called() + + +class TestGetContent: + """Tests for get_content function.""" + + @patch("datajoint.content_registry.get_store_backend") + def test_retrieves_content(self, mock_get_backend): + """Test 
retrieving content by hash.""" + data = b"stored content" + content_hash = compute_content_hash(data) + + mock_backend = MagicMock() + mock_backend.get_buffer.return_value = data + mock_get_backend.return_value = mock_backend + + result = get_content(content_hash, store_name="test_store") + + assert result == data + + @patch("datajoint.content_registry.get_store_backend") + def test_verifies_hash(self, mock_get_backend): + """Test that hash is verified on retrieval.""" + data = b"original content" + content_hash = compute_content_hash(data) + + # Return corrupted data + mock_backend = MagicMock() + mock_backend.get_buffer.return_value = b"corrupted content" + mock_get_backend.return_value = mock_backend + + with pytest.raises(DataJointError, match="Content hash mismatch"): + get_content(content_hash, store_name="test_store") + + +class TestContentExists: + """Tests for content_exists function.""" + + @patch("datajoint.content_registry.get_store_backend") + def test_returns_true_when_exists(self, mock_get_backend): + """Test that True is returned when content exists.""" + mock_backend = MagicMock() + mock_backend.exists.return_value = True + mock_get_backend.return_value = mock_backend + + content_hash = "a" * 64 + assert content_exists(content_hash, store_name="test_store") is True + + @patch("datajoint.content_registry.get_store_backend") + def test_returns_false_when_not_exists(self, mock_get_backend): + """Test that False is returned when content doesn't exist.""" + mock_backend = MagicMock() + mock_backend.exists.return_value = False + mock_get_backend.return_value = mock_backend + + content_hash = "a" * 64 + assert content_exists(content_hash, store_name="test_store") is False + + +class TestDeleteContent: + """Tests for delete_content function.""" + + @patch("datajoint.content_registry.get_store_backend") + def test_deletes_existing_content(self, mock_get_backend): + """Test deleting existing content.""" + mock_backend = MagicMock() + mock_backend.exists.return_value = True + mock_get_backend.return_value = mock_backend + + content_hash = "a" * 64 + result = delete_content(content_hash, store_name="test_store") + + assert result is True + mock_backend.remove.assert_called_once() + + @patch("datajoint.content_registry.get_store_backend") + def test_returns_false_for_nonexistent(self, mock_get_backend): + """Test that False is returned when content doesn't exist.""" + mock_backend = MagicMock() + mock_backend.exists.return_value = False + mock_get_backend.return_value = mock_backend + + content_hash = "a" * 64 + result = delete_content(content_hash, store_name="test_store") + + assert result is False + mock_backend.remove.assert_not_called() + + +class TestGetContentSize: + """Tests for get_content_size function.""" + + @patch("datajoint.content_registry.get_store_backend") + def test_returns_size(self, mock_get_backend): + """Test getting content size.""" + mock_backend = MagicMock() + mock_backend.size.return_value = 1024 + mock_get_backend.return_value = mock_backend + + content_hash = "a" * 64 + result = get_content_size(content_hash, store_name="test_store") + + assert result == 1024 diff --git a/tests/test_gc.py b/tests/test_gc.py new file mode 100644 index 000000000..5af71a0a9 --- /dev/null +++ b/tests/test_gc.py @@ -0,0 +1,214 @@ +""" +Tests for garbage collection (gc.py). 
+""" + +from unittest.mock import MagicMock, patch + +import pytest + +from datajoint import gc +from datajoint.errors import DataJointError + + +class TestUsesContentStorage: + """Tests for _uses_content_storage helper function.""" + + def test_returns_false_for_no_adapter(self): + """Test that False is returned when attribute has no adapter.""" + attr = MagicMock() + attr.adapter = None + + assert gc._uses_content_storage(attr) is False + + def test_returns_true_for_content_type(self): + """Test that True is returned for type.""" + attr = MagicMock() + attr.adapter = MagicMock() + attr.adapter.type_name = "content" + + assert gc._uses_content_storage(attr) is True + + def test_returns_true_for_xblob_type(self): + """Test that True is returned for type.""" + attr = MagicMock() + attr.adapter = MagicMock() + attr.adapter.type_name = "xblob" + + assert gc._uses_content_storage(attr) is True + + def test_returns_true_for_xattach_type(self): + """Test that True is returned for type.""" + attr = MagicMock() + attr.adapter = MagicMock() + attr.adapter.type_name = "xattach" + + assert gc._uses_content_storage(attr) is True + + def test_returns_false_for_other_types(self): + """Test that False is returned for non-content types.""" + attr = MagicMock() + attr.adapter = MagicMock() + attr.adapter.type_name = "djblob" + + assert gc._uses_content_storage(attr) is False + + +class TestExtractContentRefs: + """Tests for _extract_content_refs helper function.""" + + def test_returns_empty_for_none(self): + """Test that empty list is returned for None value.""" + assert gc._extract_content_refs(None) == [] + + def test_parses_json_string(self): + """Test parsing JSON string with hash.""" + value = '{"hash": "abc123", "store": "mystore"}' + refs = gc._extract_content_refs(value) + + assert len(refs) == 1 + assert refs[0] == ("abc123", "mystore") + + def test_parses_dict_directly(self): + """Test parsing dict with hash.""" + value = {"hash": "def456", "store": None} + refs = gc._extract_content_refs(value) + + assert len(refs) == 1 + assert refs[0] == ("def456", None) + + def test_returns_empty_for_invalid_json(self): + """Test that empty list is returned for invalid JSON.""" + assert gc._extract_content_refs("not json") == [] + + def test_returns_empty_for_dict_without_hash(self): + """Test that empty list is returned for dict without hash key.""" + assert gc._extract_content_refs({"other": "data"}) == [] + + +class TestScan: + """Tests for scan function.""" + + def test_requires_at_least_one_schema(self): + """Test that at least one schema is required.""" + with pytest.raises(DataJointError, match="At least one schema must be provided"): + gc.scan() + + @patch("datajoint.gc.scan_references") + @patch("datajoint.gc.list_stored_content") + def test_returns_stats(self, mock_list_stored, mock_scan_refs): + """Test that scan returns proper statistics.""" + # Mock referenced hashes + mock_scan_refs.return_value = {"hash1", "hash2"} + + # Mock stored content (hash1 referenced, hash3 orphaned) + mock_list_stored.return_value = { + "hash1": 100, + "hash3": 200, + } + + mock_schema = MagicMock() + stats = gc.scan(mock_schema, store_name="test_store") + + assert stats["referenced"] == 2 + assert stats["stored"] == 2 + assert stats["orphaned"] == 1 + assert stats["orphaned_bytes"] == 200 + assert "hash3" in stats["orphaned_hashes"] + + +class TestCollect: + """Tests for collect function.""" + + @patch("datajoint.gc.scan") + def test_dry_run_does_not_delete(self, mock_scan): + """Test that dry_run=True doesn't delete 
anything.""" + mock_scan.return_value = { + "referenced": 1, + "stored": 2, + "orphaned": 1, + "orphaned_bytes": 100, + "orphaned_hashes": ["orphan_hash"], + } + + mock_schema = MagicMock() + stats = gc.collect(mock_schema, store_name="test_store", dry_run=True) + + assert stats["deleted"] == 0 + assert stats["bytes_freed"] == 0 + assert stats["dry_run"] is True + + @patch("datajoint.gc.delete_content") + @patch("datajoint.gc.list_stored_content") + @patch("datajoint.gc.scan") + def test_deletes_orphaned_content(self, mock_scan, mock_list_stored, mock_delete): + """Test that orphaned content is deleted when dry_run=False.""" + mock_scan.return_value = { + "referenced": 1, + "stored": 2, + "orphaned": 1, + "orphaned_bytes": 100, + "orphaned_hashes": ["orphan_hash"], + } + mock_list_stored.return_value = {"orphan_hash": 100} + mock_delete.return_value = True + + mock_schema = MagicMock() + stats = gc.collect(mock_schema, store_name="test_store", dry_run=False) + + assert stats["deleted"] == 1 + assert stats["bytes_freed"] == 100 + assert stats["dry_run"] is False + mock_delete.assert_called_once_with("orphan_hash", "test_store") + + +class TestFormatStats: + """Tests for format_stats function.""" + + def test_formats_scan_stats(self): + """Test formatting scan statistics.""" + stats = { + "referenced": 10, + "stored": 15, + "orphaned": 5, + "orphaned_bytes": 1024 * 1024, # 1 MB + } + + result = gc.format_stats(stats) + + assert "Referenced in database: 10" in result + assert "Stored in backend: 15" in result + assert "Orphaned (unreferenced): 5" in result + assert "1.00 MB" in result + + def test_formats_collect_stats_dry_run(self): + """Test formatting collect statistics with dry_run.""" + stats = { + "referenced": 10, + "stored": 15, + "orphaned": 5, + "deleted": 0, + "bytes_freed": 0, + "dry_run": True, + } + + result = gc.format_stats(stats) + + assert "DRY RUN" in result + + def test_formats_collect_stats_actual(self): + """Test formatting collect statistics after actual deletion.""" + stats = { + "referenced": 10, + "stored": 15, + "orphaned": 5, + "deleted": 3, + "bytes_freed": 2 * 1024 * 1024, # 2 MB + "errors": 2, + "dry_run": False, + } + + result = gc.format_stats(stats) + + assert "Deleted: 3" in result + assert "2.00 MB" in result + assert "Errors: 2" in result diff --git a/tests/test_type_composition.py b/tests/test_type_composition.py new file mode 100644 index 000000000..0b51b3d68 --- /dev/null +++ b/tests/test_type_composition.py @@ -0,0 +1,352 @@ +""" +Tests for type composition (type chain encoding/decoding). + +This tests the → json composition pattern +and similar type chains. 
+""" + +from datajoint.attribute_type import ( + AttributeType, + _type_registry, + register_type, + resolve_dtype, +) + + +class TestTypeChainResolution: + """Tests for resolving type chains.""" + + def setup_method(self): + """Clear test types from registry before each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def teardown_method(self): + """Clean up test types after each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def test_single_type_chain(self): + """Test resolving a single-type chain.""" + + @register_type + class TestSingle(AttributeType): + type_name = "test_single" + dtype = "varchar(100)" + + def encode(self, value, *, key=None, store_name=None): + return str(value) + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "varchar(100)" + assert len(chain) == 1 + assert chain[0].type_name == "test_single" + assert store is None + + def test_two_type_chain(self): + """Test resolving a two-type chain.""" + + @register_type + class TestInner(AttributeType): + type_name = "test_inner" + dtype = "longblob" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + @register_type + class TestOuter(AttributeType): + type_name = "test_outer" + dtype = "" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "longblob" + assert len(chain) == 2 + assert chain[0].type_name == "test_outer" + assert chain[1].type_name == "test_inner" + + def test_three_type_chain(self): + """Test resolving a three-type chain.""" + + @register_type + class TestBase(AttributeType): + type_name = "test_base" + dtype = "json" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + @register_type + class TestMiddle(AttributeType): + type_name = "test_middle" + dtype = "" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + @register_type + class TestTop(AttributeType): + type_name = "test_top" + dtype = "" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 3 + assert chain[0].type_name == "test_top" + assert chain[1].type_name == "test_middle" + assert chain[2].type_name == "test_base" + + +class TestTypeChainEncodeDecode: + """Tests for encode/decode through type chains.""" + + def setup_method(self): + """Clear test types from registry before each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def teardown_method(self): + """Clean up test types after each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def test_encode_order(self): + """Test that encode is applied outer → inner.""" + encode_order = [] + + @register_type + class TestInnerEnc(AttributeType): + type_name = "test_inner_enc" + dtype = "longblob" + + def encode(self, value, *, key=None, store_name=None): + 
encode_order.append("inner") + return value + b"_inner" + + def decode(self, stored, *, key=None): + return stored + + @register_type + class TestOuterEnc(AttributeType): + type_name = "test_outer_enc" + dtype = "" + + def encode(self, value, *, key=None, store_name=None): + encode_order.append("outer") + return value + b"_outer" + + def decode(self, stored, *, key=None): + return stored + + _, chain, _ = resolve_dtype("") + + # Apply encode in order: outer first, then inner + value = b"start" + for attr_type in chain: + value = attr_type.encode(value) + + assert encode_order == ["outer", "inner"] + assert value == b"start_outer_inner" + + def test_decode_order(self): + """Test that decode is applied inner → outer (reverse of encode).""" + decode_order = [] + + @register_type + class TestInnerDec(AttributeType): + type_name = "test_inner_dec" + dtype = "longblob" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + decode_order.append("inner") + return stored.replace(b"_inner", b"") + + @register_type + class TestOuterDec(AttributeType): + type_name = "test_outer_dec" + dtype = "" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + decode_order.append("outer") + return stored.replace(b"_outer", b"") + + _, chain, _ = resolve_dtype("") + + # Apply decode in reverse order: inner first, then outer + value = b"start_outer_inner" + for attr_type in reversed(chain): + value = attr_type.decode(value) + + assert decode_order == ["inner", "outer"] + assert value == b"start" + + def test_roundtrip(self): + """Test encode/decode roundtrip through a type chain.""" + + @register_type + class TestInnerRt(AttributeType): + type_name = "test_inner_rt" + dtype = "longblob" + + def encode(self, value, *, key=None, store_name=None): + # Compress (just add prefix for testing) + return b"COMPRESSED:" + value + + def decode(self, stored, *, key=None): + # Decompress + return stored.replace(b"COMPRESSED:", b"") + + @register_type + class TestOuterRt(AttributeType): + type_name = "test_outer_rt" + dtype = "" + + def encode(self, value, *, key=None, store_name=None): + # Serialize (just encode string for testing) + return str(value).encode("utf-8") + + def decode(self, stored, *, key=None): + # Deserialize + return stored.decode("utf-8") + + _, chain, _ = resolve_dtype("") + + # Original value + original = "test data" + + # Encode: outer → inner + encoded = original + for attr_type in chain: + encoded = attr_type.encode(encoded) + + assert encoded == b"COMPRESSED:test data" + + # Decode: inner → outer (reversed) + decoded = encoded + for attr_type in reversed(chain): + decoded = attr_type.decode(decoded) + + assert decoded == original + + +class TestBuiltinTypeComposition: + """Tests for built-in type composition.""" + + def test_xblob_resolves_to_json(self): + """Test that → json.""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 2 + assert chain[0].type_name == "xblob" + assert chain[1].type_name == "content" + + def test_xattach_resolves_to_json(self): + """Test that → json.""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 2 + assert chain[0].type_name == "xattach" + assert chain[1].type_name == "content" + + def test_djblob_resolves_to_longblob(self): + """Test that → longblob (no chain).""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "longblob" + 
assert len(chain) == 1 + assert chain[0].type_name == "djblob" + + def test_content_resolves_to_json(self): + """Test that → json.""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 1 + assert chain[0].type_name == "content" + + def test_object_resolves_to_json(self): + """Test that → json.""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 1 + assert chain[0].type_name == "object" + + def test_attach_resolves_to_longblob(self): + """Test that → longblob.""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "longblob" + assert len(chain) == 1 + assert chain[0].type_name == "attach" + + def test_filepath_resolves_to_json(self): + """Test that → json.""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 1 + assert chain[0].type_name == "filepath" + + +class TestStoreNameParsing: + """Tests for store name parsing in type specs.""" + + def test_type_with_store(self): + """Test parsing type with store name.""" + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "json" + assert store == "mystore" + + def test_type_without_store(self): + """Test parsing type without store name.""" + final_dtype, chain, store = resolve_dtype("") + + assert store is None + + def test_filepath_with_store(self): + """Test parsing filepath with store name.""" + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "json" + assert store == "s3store" From 73535de8790491dee128ba5cffde2be5114f2107 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Dec 2025 02:22:38 +0000 Subject: [PATCH 39/41] Add object type garbage collection support Extend gc.py to handle both storage patterns: - Content-addressed storage: , , - Path-addressed storage: New functions added: - _uses_object_storage() - detect object type attributes - _extract_object_refs() - extract path refs from JSON - scan_object_references() - scan schemas for object paths - list_stored_objects() - list all objects in storage - delete_object() - delete object directory tree Updated scan() and collect() to handle both storage types, with combined and per-type statistics in the output. Updated tests for new statistics format. Co-authored-by: dimitri-yatsenko --- .../storage-types-implementation-plan.md | 31 +- src/datajoint/gc.py | 360 +++++++++++++++--- tests/test_gc.py | 143 ++++++- 3 files changed, 467 insertions(+), 67 deletions(-) diff --git a/docs/src/design/tables/storage-types-implementation-plan.md b/docs/src/design/tables/storage-types-implementation-plan.md index 8ce582f57..c15a2292c 100644 --- a/docs/src/design/tables/storage-types-implementation-plan.md +++ b/docs/src/design/tables/storage-types-implementation-plan.md @@ -346,29 +346,42 @@ def _get(connection, attr, data, squeeze, download_path): ```python import datajoint as dj -# Scan schemas and find orphaned content +# Scan schemas and find orphaned content/objects stats = dj.gc.scan(schema1, schema2, store_name='mystore') -# Remove orphaned content (dry_run=False to actually delete) +# Remove orphaned content/objects (dry_run=False to actually delete) stats = dj.gc.collect(schema1, schema2, store_name='mystore', dry_run=True) # Format statistics for display print(dj.gc.format_stats(stats)) ``` +**Supported storage patterns:** + +1. **Content-Addressed Storage** (``, ``, ``): + - Stored at: `_content/{hash[:2]}/{hash[2:4]}/{hash}` + - Referenced by SHA256 hash in JSON metadata + +2. 
**Path-Addressed Storage** (``): + - Stored at: `{schema}/{table}/objects/{pk}/{field}_{token}/` + - Referenced by path in JSON metadata + **Key functions:** - `scan_references(*schemas, store_name=None)` - Scan tables for content hashes +- `scan_object_references(*schemas, store_name=None)` - Scan tables for object paths - `list_stored_content(store_name=None)` - List all content in `_content/` directory -- `scan(*schemas, store_name=None)` - Find orphaned content without deleting -- `collect(*schemas, store_name=None, dry_run=True)` - Remove orphaned content +- `list_stored_objects(store_name=None)` - List all objects in `*/objects/` directories +- `scan(*schemas, store_name=None)` - Find orphaned content/objects without deleting +- `collect(*schemas, store_name=None, dry_run=True)` - Remove orphaned content/objects +- `delete_object(path, store_name=None)` - Delete an object directory - `format_stats(stats)` - Human-readable statistics output **GC Process:** -1. Scan all tables in provided schemas for content-type attributes -2. Extract content hashes from JSON metadata in those columns -3. Scan storage `_content/` directory for all stored hashes -4. Compute orphaned = stored - referenced -5. Optionally delete orphaned content (when `dry_run=False`) +1. Scan all tables in provided schemas for content-type and object-type attributes +2. Extract content hashes and object paths from JSON metadata columns +3. Scan storage for all stored content (`_content/`) and objects (`*/objects/`) +4. Compute orphaned = stored - referenced (for both types) +5. Optionally delete orphaned items (when `dry_run=False`) --- diff --git a/src/datajoint/gc.py b/src/datajoint/gc.py index e862287fc..e0b7aaafe 100644 --- a/src/datajoint/gc.py +++ b/src/datajoint/gc.py @@ -1,10 +1,17 @@ """ -Garbage collection for content-addressed storage. +Garbage collection for external storage. This module provides utilities to identify and remove orphaned content from external storage. Content becomes orphaned when all database rows referencing it are deleted. +Supports two storage patterns: +- Content-addressed storage: , , + Stored at: _content/{hash[:2]}/{hash[2:4]}/{hash} + +- Path-addressed storage: + Stored at: {schema}/{table}/objects/{pk}/{field}_{token}/ + Usage: import datajoint as dj @@ -53,6 +60,23 @@ def _uses_content_storage(attr) -> bool: return type_name in ("content", "xblob", "xattach") +def _uses_object_storage(attr) -> bool: + """ + Check if an attribute uses path-addressed object storage. + + Args: + attr: Attribute from table heading + + Returns: + True if the attribute stores object paths + """ + if not attr.adapter: + return False + + type_name = getattr(attr.adapter, "type_name", "") + return type_name == "object" + + def _extract_content_refs(value: Any) -> list[tuple[str, str | None]]: """ Extract content references from a stored value. @@ -82,6 +106,35 @@ def _extract_content_refs(value: Any) -> list[tuple[str, str | None]]: return refs +def _extract_object_refs(value: Any) -> list[tuple[str, str | None]]: + """ + Extract object path references from a stored value. 
+ + Args: + value: The stored value (could be JSON string or dict) + + Returns: + List of (path, store_name) tuples + """ + refs = [] + + if value is None: + return refs + + # Parse JSON if string + if isinstance(value, str): + try: + value = json.loads(value) + except (json.JSONDecodeError, TypeError): + return refs + + # Extract path from dict + if isinstance(value, dict) and "path" in value: + refs.append((value["path"], value.get("store"))) + + return refs + + def scan_references( *schemas: "Schema", store_name: str | None = None, @@ -139,6 +192,62 @@ def scan_references( return referenced +def scan_object_references( + *schemas: "Schema", + store_name: str | None = None, + verbose: bool = False, +) -> set[str]: + """ + Scan schemas for object path references. + + Examines all tables in the given schemas and extracts object paths + from columns that use path-addressed storage (). + + Args: + *schemas: Schema instances to scan + store_name: Only include references to this store (None = all stores) + verbose: Print progress information + + Returns: + Set of object paths that are referenced + """ + referenced: set[str] = set() + + for schema in schemas: + if verbose: + logger.info(f"Scanning schema for objects: {schema.database}") + + # Get all tables in schema + for table_name in schema.list_tables(): + try: + # Get table class + table = schema.spawn_table(table_name) + + # Check each attribute for object storage + for attr_name, attr in table.heading.attributes.items(): + if not _uses_object_storage(attr): + continue + + if verbose: + logger.info(f" Scanning {table_name}.{attr_name}") + + # Fetch all values for this attribute + try: + values = table.fetch(attr_name) + for value in values: + for path, ref_store in _extract_object_refs(value): + # Filter by store if specified + if store_name is None or ref_store == store_name: + referenced.add(path) + except Exception as e: + logger.warning(f"Error scanning {table_name}.{attr_name}: {e}") + + except Exception as e: + logger.warning(f"Error accessing table {table_name}: {e}") + + return referenced + + def list_stored_content(store_name: str | None = None) -> dict[str, int]: """ List all content hashes in storage. @@ -189,13 +298,94 @@ def list_stored_content(store_name: str | None = None) -> dict[str, int]: return stored +def list_stored_objects(store_name: str | None = None) -> dict[str, int]: + """ + List all object paths in storage. 
+ + Scans for directories matching the object storage pattern: + {schema}/{table}/objects/{pk}/{field}_{token}/ + + Args: + store_name: Store to scan (None = default store) + + Returns: + Dict mapping object_path to size in bytes + """ + backend = get_store_backend(store_name) + stored: dict[str, int] = {} + + try: + # Walk the storage looking for /objects/ directories + full_prefix = backend._full_path("") + + for root, dirs, files in backend.fs.walk(full_prefix): + # Skip _content directory + if "_content" in root: + continue + + # Look for "objects" directory pattern + if "/objects/" in root: + # This could be an object storage path + # Path pattern: {schema}/{table}/objects/{pk}/{field}_{token} + relative_path = root.replace(full_prefix, "").lstrip("/") + + # Calculate total size of this object directory + total_size = 0 + for file in files: + try: + file_path = f"{root}/{file}" + total_size += backend.fs.size(file_path) + except Exception: + pass + + # Only count directories with files (actual objects) + if total_size > 0 or files: + stored[relative_path] = total_size + + except FileNotFoundError: + pass + except Exception as e: + logger.warning(f"Error listing stored objects: {e}") + + return stored + + +def delete_object(path: str, store_name: str | None = None) -> bool: + """ + Delete an object directory from storage. + + Args: + path: Object path (relative to store root) + store_name: Store name (None = default store) + + Returns: + True if deleted, False if not found + """ + backend = get_store_backend(store_name) + + try: + full_path = backend._full_path(path) + if backend.fs.exists(full_path): + # Remove entire directory tree + backend.fs.rm(full_path, recursive=True) + logger.debug(f"Deleted object: {path}") + return True + except Exception as e: + logger.warning(f"Error deleting object {path}: {e}") + + return False + + def scan( *schemas: "Schema", store_name: str | None = None, verbose: bool = False, ) -> dict[str, Any]: """ - Scan for orphaned content without deleting. + Scan for orphaned content and objects without deleting. + + Scans both content-addressed storage (for , , ) + and path-addressed storage (for ). 
Args: *schemas: Schema instances to scan @@ -204,31 +394,50 @@ def scan( Returns: Dict with scan statistics: - - referenced: Number of content items referenced in database - - stored: Number of content items in storage - - orphaned: Number of unreferenced content items - - orphaned_bytes: Total size of orphaned content + - content_referenced: Number of content items referenced in database + - content_stored: Number of content items in storage + - content_orphaned: Number of unreferenced content items + - content_orphaned_bytes: Total size of orphaned content - orphaned_hashes: List of orphaned content hashes + - object_referenced: Number of objects referenced in database + - object_stored: Number of objects in storage + - object_orphaned: Number of unreferenced objects + - object_orphaned_bytes: Total size of orphaned objects + - orphaned_paths: List of orphaned object paths """ if not schemas: raise DataJointError("At least one schema must be provided") - # Find all referenced content - referenced = scan_references(*schemas, store_name=store_name, verbose=verbose) + # --- Content-addressed storage --- + content_referenced = scan_references(*schemas, store_name=store_name, verbose=verbose) + content_stored = list_stored_content(store_name) + orphaned_hashes = set(content_stored.keys()) - content_referenced + content_orphaned_bytes = sum(content_stored.get(h, 0) for h in orphaned_hashes) - # Find all stored content - stored = list_stored_content(store_name) - - # Find orphaned content - orphaned_hashes = set(stored.keys()) - referenced - orphaned_bytes = sum(stored.get(h, 0) for h in orphaned_hashes) + # --- Path-addressed storage (objects) --- + object_referenced = scan_object_references(*schemas, store_name=store_name, verbose=verbose) + object_stored = list_stored_objects(store_name) + orphaned_paths = set(object_stored.keys()) - object_referenced + object_orphaned_bytes = sum(object_stored.get(p, 0) for p in orphaned_paths) return { - "referenced": len(referenced), - "stored": len(stored), - "orphaned": len(orphaned_hashes), - "orphaned_bytes": orphaned_bytes, + # Content-addressed storage stats + "content_referenced": len(content_referenced), + "content_stored": len(content_stored), + "content_orphaned": len(orphaned_hashes), + "content_orphaned_bytes": content_orphaned_bytes, "orphaned_hashes": sorted(orphaned_hashes), + # Path-addressed storage stats + "object_referenced": len(object_referenced), + "object_stored": len(object_stored), + "object_orphaned": len(orphaned_paths), + "object_orphaned_bytes": object_orphaned_bytes, + "orphaned_paths": sorted(orphaned_paths), + # Combined totals + "referenced": len(content_referenced) + len(object_referenced), + "stored": len(content_stored) + len(object_stored), + "orphaned": len(orphaned_hashes) + len(orphaned_paths), + "orphaned_bytes": content_orphaned_bytes + object_orphaned_bytes, } @@ -239,10 +448,10 @@ def collect( verbose: bool = False, ) -> dict[str, Any]: """ - Remove orphaned content from storage. + Remove orphaned content and objects from storage. - Scans the given schemas for content references, then removes any - content in storage that is not referenced. + Scans the given schemas for content and object references, then removes any + storage items that are not referenced. 
Args: *schemas: Schema instances to scan @@ -252,43 +461,69 @@ def collect( Returns: Dict with collection statistics: - - referenced: Number of content items referenced in database - - stored: Number of content items in storage - - orphaned: Number of unreferenced content items - - deleted: Number of items deleted (0 if dry_run) + - referenced: Total items referenced in database + - stored: Total items in storage + - orphaned: Total unreferenced items + - content_deleted: Number of content items deleted + - object_deleted: Number of object items deleted + - deleted: Total items deleted (0 if dry_run) - bytes_freed: Bytes freed (0 if dry_run) - errors: Number of deletion errors """ - # First scan to find orphaned content + # First scan to find orphaned content and objects stats = scan(*schemas, store_name=store_name, verbose=verbose) - deleted = 0 + content_deleted = 0 + object_deleted = 0 bytes_freed = 0 errors = 0 - if not dry_run and stats["orphaned"] > 0: - stored = list_stored_content(store_name) - - for content_hash in stats["orphaned_hashes"]: - try: - size = stored.get(content_hash, 0) - if delete_content(content_hash, store_name): - deleted += 1 - bytes_freed += size - if verbose: - logger.info(f"Deleted: {content_hash[:16]}... ({size} bytes)") - except Exception as e: - errors += 1 - logger.warning(f"Failed to delete {content_hash[:16]}...: {e}") + if not dry_run: + # Delete orphaned content (hash-addressed) + if stats["content_orphaned"] > 0: + content_stored = list_stored_content(store_name) + + for content_hash in stats["orphaned_hashes"]: + try: + size = content_stored.get(content_hash, 0) + if delete_content(content_hash, store_name): + content_deleted += 1 + bytes_freed += size + if verbose: + logger.info(f"Deleted content: {content_hash[:16]}... 
({size} bytes)") + except Exception as e: + errors += 1 + logger.warning(f"Failed to delete content {content_hash[:16]}...: {e}") + + # Delete orphaned objects (path-addressed) + if stats["object_orphaned"] > 0: + object_stored = list_stored_objects(store_name) + + for path in stats["orphaned_paths"]: + try: + size = object_stored.get(path, 0) + if delete_object(path, store_name): + object_deleted += 1 + bytes_freed += size + if verbose: + logger.info(f"Deleted object: {path} ({size} bytes)") + except Exception as e: + errors += 1 + logger.warning(f"Failed to delete object {path}: {e}") return { "referenced": stats["referenced"], "stored": stats["stored"], "orphaned": stats["orphaned"], - "deleted": deleted, + "content_deleted": content_deleted, + "object_deleted": object_deleted, + "deleted": content_deleted + object_deleted, "bytes_freed": bytes_freed, "errors": errors, "dry_run": dry_run, + # Include detailed stats + "content_orphaned": stats["content_orphaned"], + "object_orphaned": stats["object_orphaned"], } @@ -302,23 +537,52 @@ def format_stats(stats: dict[str, Any]) -> str: Returns: Formatted string """ - lines = [ - "Content Storage Statistics:", - f" Referenced in database: {stats['referenced']}", - f" Stored in backend: {stats['stored']}", - f" Orphaned (unreferenced): {stats['orphaned']}", - ] + lines = ["External Storage Statistics:"] + + # Show content-addressed storage stats if present + if "content_referenced" in stats: + lines.append("") + lines.append("Content-Addressed Storage (, , ):") + lines.append(f" Referenced: {stats['content_referenced']}") + lines.append(f" Stored: {stats['content_stored']}") + lines.append(f" Orphaned: {stats['content_orphaned']}") + if "content_orphaned_bytes" in stats: + size_mb = stats["content_orphaned_bytes"] / (1024 * 1024) + lines.append(f" Orphaned size: {size_mb:.2f} MB") + + # Show path-addressed storage stats if present + if "object_referenced" in stats: + lines.append("") + lines.append("Path-Addressed Storage ():") + lines.append(f" Referenced: {stats['object_referenced']}") + lines.append(f" Stored: {stats['object_stored']}") + lines.append(f" Orphaned: {stats['object_orphaned']}") + if "object_orphaned_bytes" in stats: + size_mb = stats["object_orphaned_bytes"] / (1024 * 1024) + lines.append(f" Orphaned size: {size_mb:.2f} MB") + + # Show totals + lines.append("") + lines.append("Totals:") + lines.append(f" Referenced in database: {stats['referenced']}") + lines.append(f" Stored in backend: {stats['stored']}") + lines.append(f" Orphaned (unreferenced): {stats['orphaned']}") if "orphaned_bytes" in stats: size_mb = stats["orphaned_bytes"] / (1024 * 1024) lines.append(f" Orphaned size: {size_mb:.2f} MB") + # Show deletion results if this is from collect() if "deleted" in stats: lines.append("") if stats.get("dry_run", True): lines.append(" [DRY RUN - no changes made]") else: lines.append(f" Deleted: {stats['deleted']}") + if "content_deleted" in stats: + lines.append(f" Content: {stats['content_deleted']}") + if "object_deleted" in stats: + lines.append(f" Objects: {stats['object_deleted']}") freed_mb = stats["bytes_freed"] / (1024 * 1024) lines.append(f" Bytes freed: {freed_mb:.2f} MB") if stats.get("errors", 0) > 0: diff --git a/tests/test_gc.py b/tests/test_gc.py index 5af71a0a9..2c312bcc0 100644 --- a/tests/test_gc.py +++ b/tests/test_gc.py @@ -85,6 +85,61 @@ def test_returns_empty_for_dict_without_hash(self): assert gc._extract_content_refs({"other": "data"}) == [] +class TestUsesObjectStorage: + """Tests for 
_uses_object_storage helper function.""" + + def test_returns_false_for_no_adapter(self): + """Test that False is returned when attribute has no adapter.""" + attr = MagicMock() + attr.adapter = None + + assert gc._uses_object_storage(attr) is False + + def test_returns_true_for_object_type(self): + """Test that True is returned for type.""" + attr = MagicMock() + attr.adapter = MagicMock() + attr.adapter.type_name = "object" + + assert gc._uses_object_storage(attr) is True + + def test_returns_false_for_other_types(self): + """Test that False is returned for non-object types.""" + attr = MagicMock() + attr.adapter = MagicMock() + attr.adapter.type_name = "xblob" + + assert gc._uses_object_storage(attr) is False + + +class TestExtractObjectRefs: + """Tests for _extract_object_refs helper function.""" + + def test_returns_empty_for_none(self): + """Test that empty list is returned for None value.""" + assert gc._extract_object_refs(None) == [] + + def test_parses_json_string(self): + """Test parsing JSON string with path.""" + value = '{"path": "schema/table/objects/pk/field_abc123", "store": "mystore"}' + refs = gc._extract_object_refs(value) + + assert len(refs) == 1 + assert refs[0] == ("schema/table/objects/pk/field_abc123", "mystore") + + def test_parses_dict_directly(self): + """Test parsing dict with path.""" + value = {"path": "test/path", "store": None} + refs = gc._extract_object_refs(value) + + assert len(refs) == 1 + assert refs[0] == ("test/path", None) + + def test_returns_empty_for_dict_without_path(self): + """Test that empty list is returned for dict without path key.""" + assert gc._extract_object_refs({"other": "data"}) == [] + + class TestScan: """Tests for scan function.""" @@ -93,28 +148,47 @@ def test_requires_at_least_one_schema(self): with pytest.raises(DataJointError, match="At least one schema must be provided"): gc.scan() + @patch("datajoint.gc.scan_object_references") + @patch("datajoint.gc.list_stored_objects") @patch("datajoint.gc.scan_references") @patch("datajoint.gc.list_stored_content") - def test_returns_stats(self, mock_list_stored, mock_scan_refs): + def test_returns_stats(self, mock_list_content, mock_scan_refs, mock_list_objects, mock_scan_objects): """Test that scan returns proper statistics.""" - # Mock referenced hashes + # Mock content-addressed storage mock_scan_refs.return_value = {"hash1", "hash2"} - - # Mock stored content (hash1 referenced, hash3 orphaned) - mock_list_stored.return_value = { + mock_list_content.return_value = { "hash1": 100, - "hash3": 200, + "hash3": 200, # orphaned + } + + # Mock path-addressed storage + mock_scan_objects.return_value = {"path/to/obj1"} + mock_list_objects.return_value = { + "path/to/obj1": 500, + "path/to/obj2": 300, # orphaned } mock_schema = MagicMock() stats = gc.scan(mock_schema, store_name="test_store") - assert stats["referenced"] == 2 - assert stats["stored"] == 2 - assert stats["orphaned"] == 1 - assert stats["orphaned_bytes"] == 200 + # Content stats + assert stats["content_referenced"] == 2 + assert stats["content_stored"] == 2 + assert stats["content_orphaned"] == 1 assert "hash3" in stats["orphaned_hashes"] + # Object stats + assert stats["object_referenced"] == 1 + assert stats["object_stored"] == 2 + assert stats["object_orphaned"] == 1 + assert "path/to/obj2" in stats["orphaned_paths"] + + # Combined totals + assert stats["referenced"] == 3 + assert stats["stored"] == 4 + assert stats["orphaned"] == 2 + assert stats["orphaned_bytes"] == 500 # 200 content + 300 object + class TestCollect: 
"""Tests for collect function.""" @@ -128,6 +202,9 @@ def test_dry_run_does_not_delete(self, mock_scan): "orphaned": 1, "orphaned_bytes": 100, "orphaned_hashes": ["orphan_hash"], + "orphaned_paths": [], + "content_orphaned": 1, + "object_orphaned": 0, } mock_schema = MagicMock() @@ -148,6 +225,9 @@ def test_deletes_orphaned_content(self, mock_scan, mock_list_stored, mock_delete "orphaned": 1, "orphaned_bytes": 100, "orphaned_hashes": ["orphan_hash"], + "orphaned_paths": [], + "content_orphaned": 1, + "object_orphaned": 0, } mock_list_stored.return_value = {"orphan_hash": 100} mock_delete.return_value = True @@ -156,10 +236,38 @@ def test_deletes_orphaned_content(self, mock_scan, mock_list_stored, mock_delete stats = gc.collect(mock_schema, store_name="test_store", dry_run=False) assert stats["deleted"] == 1 + assert stats["content_deleted"] == 1 assert stats["bytes_freed"] == 100 assert stats["dry_run"] is False mock_delete.assert_called_once_with("orphan_hash", "test_store") + @patch("datajoint.gc.delete_object") + @patch("datajoint.gc.list_stored_objects") + @patch("datajoint.gc.scan") + def test_deletes_orphaned_objects(self, mock_scan, mock_list_objects, mock_delete): + """Test that orphaned objects are deleted when dry_run=False.""" + mock_scan.return_value = { + "referenced": 1, + "stored": 2, + "orphaned": 1, + "orphaned_bytes": 500, + "orphaned_hashes": [], + "orphaned_paths": ["path/to/orphan"], + "content_orphaned": 0, + "object_orphaned": 1, + } + mock_list_objects.return_value = {"path/to/orphan": 500} + mock_delete.return_value = True + + mock_schema = MagicMock() + stats = gc.collect(mock_schema, store_name="test_store", dry_run=False) + + assert stats["deleted"] == 1 + assert stats["object_deleted"] == 1 + assert stats["bytes_freed"] == 500 + assert stats["dry_run"] is False + mock_delete.assert_called_once_with("path/to/orphan", "test_store") + class TestFormatStats: """Tests for format_stats function.""" @@ -171,6 +279,14 @@ def test_formats_scan_stats(self): "stored": 15, "orphaned": 5, "orphaned_bytes": 1024 * 1024, # 1 MB + "content_referenced": 6, + "content_stored": 8, + "content_orphaned": 2, + "content_orphaned_bytes": 512 * 1024, + "object_referenced": 4, + "object_stored": 7, + "object_orphaned": 3, + "object_orphaned_bytes": 512 * 1024, } result = gc.format_stats(stats) @@ -179,6 +295,9 @@ def test_formats_scan_stats(self): assert "Stored in backend: 15" in result assert "Orphaned (unreferenced): 5" in result assert "1.00 MB" in result + # Check for detailed sections + assert "Content-Addressed Storage" in result + assert "Path-Addressed Storage" in result def test_formats_collect_stats_dry_run(self): """Test formatting collect statistics with dry_run.""" @@ -202,6 +321,8 @@ def test_formats_collect_stats_actual(self): "stored": 15, "orphaned": 5, "deleted": 3, + "content_deleted": 2, + "object_deleted": 1, "bytes_freed": 2 * 1024 * 1024, # 2 MB "errors": 2, "dry_run": False, @@ -210,5 +331,7 @@ def test_formats_collect_stats_actual(self): result = gc.format_stats(stats) assert "Deleted: 3" in result + assert "Content: 2" in result + assert "Objects: 1" in result assert "2.00 MB" in result assert "Errors: 2" in result From 3fc00ee539352b7bd0ea5c44e5f8e14e4ad9d18b Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Dec 2025 03:30:30 +0000 Subject: [PATCH 40/41] Move EXTERNAL_TABLE_ROOT to external.py (deprecated) External tables are deprecated in favor of the new storage type system. Move the constant to external.py where it's used, keeping declare.py clean. 
Co-authored-by: dimitri-yatsenko --- src/datajoint/external.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/datajoint/external.py b/src/datajoint/external.py index 06e76af37..81230fb95 100644 --- a/src/datajoint/external.py +++ b/src/datajoint/external.py @@ -5,15 +5,18 @@ from tqdm import tqdm -from .declare import EXTERNAL_TABLE_ROOT from .errors import DataJointError, MissingExternalFile from .hash import uuid_from_buffer, uuid_from_file from .heading import Heading from .settings import config from .storage import StorageBackend from .table import FreeTable, Table + from .utils import safe_write +# External table name root (deprecated - external tables are being phased out) +EXTERNAL_TABLE_ROOT = "~external" + logger = logging.getLogger(__name__.split(".")[0]) CACHE_SUBFOLDING = ( From b4512c9fd7289e911d7c93056495fa3ad79264e1 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Dec 2025 03:35:50 +0000 Subject: [PATCH 41/41] Remove deprecated external.py module External tables (~external_*) are deprecated in favor of the new AttributeType-based storage system. The new types (, , ) store data directly to storage via StorageBackend without tracking tables. - Remove src/datajoint/external.py entirely - Remove ExternalMapping from schemas.py - Remove external table pre-declaration from table.py Co-authored-by: dimitri-yatsenko --- src/datajoint/external.py | 455 -------------------------------------- src/datajoint/schemas.py | 2 - src/datajoint/table.py | 12 +- 3 files changed, 3 insertions(+), 466 deletions(-) delete mode 100644 src/datajoint/external.py diff --git a/src/datajoint/external.py b/src/datajoint/external.py deleted file mode 100644 index 81230fb95..000000000 --- a/src/datajoint/external.py +++ /dev/null @@ -1,455 +0,0 @@ -import logging -import warnings -from collections.abc import Mapping -from pathlib import Path, PurePosixPath, PureWindowsPath - -from tqdm import tqdm - -from .errors import DataJointError, MissingExternalFile -from .hash import uuid_from_buffer, uuid_from_file -from .heading import Heading -from .settings import config -from .storage import StorageBackend -from .table import FreeTable, Table - -from .utils import safe_write - -# External table name root (deprecated - external tables are being phased out) -EXTERNAL_TABLE_ROOT = "~external" - -logger = logging.getLogger(__name__.split(".")[0]) - -CACHE_SUBFOLDING = ( - 2, - 2, -) # (2, 2) means "0123456789abcd" will be saved as "01/23/0123456789abcd" -SUPPORT_MIGRATED_BLOBS = True # support blobs migrated from datajoint 0.11.* - - -def subfold(name, folds): - """ - subfolding for external storage: e.g. subfold('aBCdefg', (2, 3)) --> ['ab','cde'] - """ - return (name[: folds[0]].lower(),) + subfold(name[folds[0] :], folds[1:]) if folds else () - - -class ExternalTable(Table): - """ - The table tracking externally stored objects. 
- Declare as ExternalTable(connection, database) - """ - - def __init__(self, connection, store, database): - self.store = store - self.database = database - self._connection = connection - self._heading = Heading( - table_info=dict( - conn=connection, - database=database, - table_name=self.table_name, - context=None, - ) - ) - self._support = [self.full_table_name] - if not self.is_declared: - self.declare() - # Initialize storage backend (validates configuration) - self.storage = StorageBackend(config.get_store_spec(store)) - - @property - def definition(self): - return """ - # external storage tracking - hash : uuid # hash of contents (blob), of filename + contents (attach), or relative filepath (filepath) - --- - size :bigint unsigned # size of object in bytes - attachment_name=null : varchar(255) # the filename of an attachment - filepath=null : varchar(1000) # relative filepath or attachment filename - contents_hash=null : uuid # used for the filepath datatype - timestamp=CURRENT_TIMESTAMP :timestamp # automatic timestamp - """ - - @property - def table_name(self): - return f"{EXTERNAL_TABLE_ROOT}_{self.store}" - - @property - def s3(self): - """Deprecated: Use storage property instead.""" - warnings.warn( - "ExternalTable.s3 is deprecated. Use ExternalTable.storage instead.", - DeprecationWarning, - stacklevel=2, - ) - # For backward compatibility, return a legacy s3.Folder if needed - from . import s3 - - if not hasattr(self, "_s3_legacy") or self._s3_legacy is None: - self._s3_legacy = s3.Folder(**self.storage.spec) - return self._s3_legacy - - # - low-level operations - private - - def _make_external_filepath(self, relative_filepath): - """resolve the complete external path based on the relative path""" - spec = self.storage.spec - # Strip root for S3 paths - if spec["protocol"] == "s3": - posix_path = PurePosixPath(PureWindowsPath(spec["location"])) - location_path = ( - Path(*posix_path.parts[1:]) - if len(spec["location"]) > 0 and any(case in posix_path.parts[0] for case in ("\\", ":")) - else Path(posix_path) - ) - return PurePosixPath(location_path, relative_filepath) - # Preserve root for local filesystem - elif spec["protocol"] == "file": - return PurePosixPath(Path(spec["location"]), relative_filepath) - else: - # For other protocols (gcs, azure, etc.), treat like S3 - location = spec.get("location", "") - return PurePosixPath(location, relative_filepath) if location else PurePosixPath(relative_filepath) - - def _make_uuid_path(self, uuid, suffix=""): - """create external path based on the uuid hash""" - return self._make_external_filepath( - PurePosixPath( - self.database, - "/".join(subfold(uuid.hex, self.storage.spec["subfolding"])), - uuid.hex, - ).with_suffix(suffix) - ) - - def _upload_file(self, local_path, external_path, metadata=None): - """Upload a file to external storage using fsspec backend.""" - self.storage.put_file(local_path, external_path, metadata) - - def _download_file(self, external_path, download_path): - """Download a file from external storage using fsspec backend.""" - self.storage.get_file(external_path, download_path) - - def _upload_buffer(self, buffer, external_path): - """Upload bytes to external storage using fsspec backend.""" - self.storage.put_buffer(buffer, external_path) - - def _download_buffer(self, external_path): - """Download bytes from external storage using fsspec backend.""" - return self.storage.get_buffer(external_path) - - def _remove_external_file(self, external_path): - """Remove a file from external storage using fsspec 
backend.""" - self.storage.remove(external_path) - - def exists(self, external_filepath): - """ - Check if an external file is accessible using fsspec backend. - - :return: True if the external file is accessible - """ - return self.storage.exists(external_filepath) - - # --- BLOBS ---- - - def put(self, blob): - """ - put a binary string (blob) in external store - """ - uuid = uuid_from_buffer(blob) - self._upload_buffer(blob, self._make_uuid_path(uuid)) - # insert tracking info - self.connection.query( - "INSERT INTO {tab} (hash, size) VALUES (%s, {size}) ON DUPLICATE KEY UPDATE timestamp=CURRENT_TIMESTAMP".format( - tab=self.full_table_name, size=len(blob) - ), - args=(uuid.bytes,), - ) - return uuid - - def get(self, uuid): - """ - get an object from external store. - """ - if uuid is None: - return None - # attempt to get object from cache - blob = None - cache_folder = config.get("cache", None) - if cache_folder: - try: - cache_path = Path(cache_folder, *subfold(uuid.hex, CACHE_SUBFOLDING)) - cache_file = Path(cache_path, uuid.hex) - blob = cache_file.read_bytes() - except FileNotFoundError: - pass # not cached - # download blob from external store - if blob is None: - try: - blob = self._download_buffer(self._make_uuid_path(uuid)) - except MissingExternalFile: - if not SUPPORT_MIGRATED_BLOBS: - raise - # blobs migrated from datajoint 0.11 are stored at explicitly defined filepaths - relative_filepath, contents_hash = (self & {"hash": uuid}).fetch1("filepath", "contents_hash") - if relative_filepath is None: - raise - blob = self._download_buffer(self._make_external_filepath(relative_filepath)) - if cache_folder: - cache_path.mkdir(parents=True, exist_ok=True) - safe_write(cache_path / uuid.hex, blob) - return blob - - # --- ATTACHMENTS --- - - def upload_attachment(self, local_path): - attachment_name = Path(local_path).name - uuid = uuid_from_file(local_path, init_string=attachment_name + "\0") - external_path = self._make_uuid_path(uuid, "." + attachment_name) - self._upload_file(local_path, external_path) - # insert tracking info - self.connection.query( - """ - INSERT INTO {tab} (hash, size, attachment_name) - VALUES (%s, {size}, "{attachment_name}") - ON DUPLICATE KEY UPDATE timestamp=CURRENT_TIMESTAMP""".format( - tab=self.full_table_name, - size=Path(local_path).stat().st_size, - attachment_name=attachment_name, - ), - args=[uuid.bytes], - ) - return uuid - - def get_attachment_name(self, uuid): - return (self & {"hash": uuid}).fetch1("attachment_name") - - def download_attachment(self, uuid, attachment_name, download_path): - """save attachment from memory buffer into the save_path""" - external_path = self._make_uuid_path(uuid, "." + attachment_name) - self._download_file(external_path, download_path) - - # --- FILEPATH --- - - def upload_filepath(self, local_filepath): - """ - Raise exception if an external entry already exists with a different contents checksum. 
- Otherwise, copy (with overwrite) file to remote and - If an external entry exists with the same checksum, then no copying should occur - """ - local_filepath = Path(local_filepath) - try: - relative_filepath = str(local_filepath.relative_to(self.storage.spec["stage"]).as_posix()) - except ValueError: - raise DataJointError(f"The path {local_filepath.parent} is not in stage {self.storage.spec['stage']}") - uuid = uuid_from_buffer(init_string=relative_filepath) # hash relative path, not contents - contents_hash = uuid_from_file(local_filepath) - - # check if the remote file already exists and verify that it matches - check_hash = (self & {"hash": uuid}).fetch("contents_hash") - if check_hash.size: - # the tracking entry exists, check that it's the same file as before - if contents_hash != check_hash[0]: - raise DataJointError(f"A different version of '{relative_filepath}' has already been placed.") - else: - # upload the file and create its tracking entry - self._upload_file( - local_filepath, - self._make_external_filepath(relative_filepath), - metadata={"contents_hash": str(contents_hash)}, - ) - self.connection.query( - "INSERT INTO {tab} (hash, size, filepath, contents_hash) VALUES (%s, {size}, '{filepath}', %s)".format( - tab=self.full_table_name, - size=Path(local_filepath).stat().st_size, - filepath=relative_filepath, - ), - args=(uuid.bytes, contents_hash.bytes), - ) - return uuid - - def download_filepath(self, filepath_hash): - """ - sync a file from external store to the local stage - - :param filepath_hash: The hash (UUID) of the relative_path - :return: hash (UUID) of the contents of the downloaded file or Nones - """ - - def _need_checksum(local_filepath, expected_size): - limit = config.get("filepath_checksum_size_limit") - actual_size = Path(local_filepath).stat().st_size - if expected_size != actual_size: - # this should never happen without outside interference - raise DataJointError(f"'{local_filepath}' downloaded but size did not match.") - return limit is None or actual_size < limit - - if filepath_hash is not None: - relative_filepath, contents_hash, size = (self & {"hash": filepath_hash}).fetch1( - "filepath", "contents_hash", "size" - ) - external_path = self._make_external_filepath(relative_filepath) - local_filepath = Path(self.storage.spec["stage"]).absolute() / relative_filepath - - file_exists = Path(local_filepath).is_file() and ( - not _need_checksum(local_filepath, size) or uuid_from_file(local_filepath) == contents_hash - ) - - if not file_exists: - self._download_file(external_path, local_filepath) - if _need_checksum(local_filepath, size) and uuid_from_file(local_filepath) != contents_hash: - # this should never happen without outside interference - raise DataJointError(f"'{local_filepath}' downloaded but did not pass checksum.") - if not _need_checksum(local_filepath, size): - logger.warning(f"Skipped checksum for file with hash: {contents_hash}, and path: {local_filepath}") - return str(local_filepath), contents_hash - - # --- UTILITIES --- - - @property - def references(self): - """ - :return: generator of referencing table names and their referencing columns - """ - return ( - {k.lower(): v for k, v in elem.items()} - for elem in self.connection.query( - """ - SELECT concat('`', table_schema, '`.`', table_name, '`') as referencing_table, column_name - FROM information_schema.key_column_usage - WHERE referenced_table_name="{tab}" and referenced_table_schema="{db}" - """.format(tab=self.table_name, db=self.database), - as_dict=True, - ) - ) - - def 
fetch_external_paths(self, **fetch_kwargs): - """ - generate complete external filepaths from the query. - Each element is a tuple: (uuid, path) - - :param fetch_kwargs: keyword arguments to pass to fetch - """ - fetch_kwargs.update(as_dict=True) - paths = [] - for item in self.fetch("hash", "attachment_name", "filepath", **fetch_kwargs): - if item["attachment_name"]: - # attachments - path = self._make_uuid_path(item["hash"], "." + item["attachment_name"]) - elif item["filepath"]: - # external filepaths - path = self._make_external_filepath(item["filepath"]) - else: - # blobs - path = self._make_uuid_path(item["hash"]) - paths.append((item["hash"], path)) - return paths - - def unused(self): - """ - query expression for unused hashes - - :return: self restricted to elements that are not in use by any tables in the schema - """ - return self - [ - FreeTable(self.connection, ref["referencing_table"]).proj(hash=ref["column_name"]) for ref in self.references - ] - - def used(self): - """ - query expression for used hashes - - :return: self restricted to elements that in use by tables in the schema - """ - return self & [ - FreeTable(self.connection, ref["referencing_table"]).proj(hash=ref["column_name"]) for ref in self.references - ] - - def delete( - self, - *, - delete_external_files=None, - limit=None, - display_progress=True, - errors_as_string=True, - ): - """ - - :param delete_external_files: True or False. If False, only the tracking info is removed from the external - store table but the external files remain intact. If True, then the external files themselves are deleted too. - :param errors_as_string: If True any errors returned when deleting from external files will be strings - :param limit: (integer) limit the number of items to delete - :param display_progress: if True, display progress as files are cleaned up - :return: if deleting external files, returns errors - """ - if delete_external_files not in (True, False): - raise DataJointError("The delete_external_files argument must be set to either True or False in delete()") - - if not delete_external_files: - self.unused().delete_quick() - else: - items = self.unused().fetch_external_paths(limit=limit) - if display_progress: - items = tqdm(items) - # delete items one by one, close to transaction-safe - error_list = [] - for uuid, external_path in items: - row = (self & {"hash": uuid}).fetch() - if row.size: - try: - (self & {"hash": uuid}).delete_quick() - except Exception: - pass # if delete failed, do not remove the external file - else: - try: - self._remove_external_file(external_path) - except Exception as error: - # adding row back into table after failed delete - self.insert1(row[0], skip_duplicates=True) - error_list.append( - ( - uuid, - external_path, - str(error) if errors_as_string else error, - ) - ) - return error_list - - -class ExternalMapping(Mapping): - """ - The external manager contains all the tables for all external stores for a given schema - :Example: - e = ExternalMapping(schema) - external_table = e[store] - """ - - def __init__(self, schema): - self.schema = schema - self._tables = {} - - def __repr__(self): - return "External file tables for schema `{schema}`:\n ".format(schema=self.schema.database) + "\n ".join( - '"{store}" {protocol}:{location}'.format(store=k, **v.spec) for k, v in self.items() - ) - - def __getitem__(self, store): - """ - Triggers the creation of an external table. - Should only be used when ready to save or read from external storage. 
- - :param store: the name of the store - :return: the ExternalTable object for the store - """ - if store not in self._tables: - self._tables[store] = ExternalTable( - connection=self.schema.connection, - store=store, - database=self.schema.database, - ) - return self._tables[store] - - def __len__(self): - return len(self._tables) - - def __iter__(self): - return iter(self._tables) diff --git a/src/datajoint/schemas.py b/src/datajoint/schemas.py index e9b83efff..0b42f0104 100644 --- a/src/datajoint/schemas.py +++ b/src/datajoint/schemas.py @@ -8,7 +8,6 @@ from .connection import conn from .errors import AccessError, DataJointError -from .external import ExternalMapping from .heading import Heading from .jobs import JobTable from .settings import config @@ -71,7 +70,6 @@ def __init__( self.create_schema = create_schema self.create_tables = create_tables self._jobs = None - self.external = ExternalMapping(self) self.add_objects = add_objects self.declare_list = [] if schema_name: diff --git a/src/datajoint/table.py b/src/datajoint/table.py index 009d475d2..dce1e70ab 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -102,12 +102,9 @@ def declare(self, context=None): "Table class name `{name}` is invalid. Please use CamelCase. ".format(name=self.class_name) + "Classes defining tables should be formatted in strict CamelCase." ) - sql, external_stores = declare(self.full_table_name, self.definition, context) + sql, _external_stores = declare(self.full_table_name, self.definition, context) sql = sql.format(database=self.database) try: - # declare all external tables before declaring main table - for store in external_stores: - self.connection.schemas[self.database].external[store] self.connection.query(sql) except AccessError: # skip if no create privilege @@ -126,7 +123,7 @@ def alter(self, prompt=True, context=None): context = dict(frame.f_globals, **frame.f_locals) del frame old_definition = self.describe(context=context) - sql, external_stores = alter(self.definition, old_definition, context) + sql, _external_stores = alter(self.definition, old_definition, context) if not sql: if prompt: logger.warning("Nothing to alter.") @@ -134,9 +131,6 @@ def alter(self, prompt=True, context=None): sql = "ALTER TABLE {tab}\n\t".format(tab=self.full_table_name) + ",\n\t".join(sql) if not prompt or user_choice(sql + "\n\nExecute?") == "yes": try: - # declare all external tables before declaring main table - for store in external_stores: - self.connection.schemas[self.database].external[store] self.connection.query(sql) except AccessError: # skip if no create privilege @@ -351,7 +345,7 @@ def _process_object_value(self, name: str, value, row: dict, store_name: str | N size = source_path.stat().st_size else: raise DataJointError( - f"Invalid value type for object attribute {name}. " "Expected file path, folder path, or (ext, stream) tuple." + f"Invalid value type for object attribute {name}. Expected file path, folder path, or (ext, stream) tuple." ) # Get storage spec for path building
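
A minimal end-to-end sketch of the garbage-collection workflow introduced in PATCH 39 follows, for reference while reviewing. The schema name and the "mystore" store are placeholders; only the functions and statistics keys defined in gc.py above are assumed.

```python
import datajoint as dj

# Placeholder schema; any schemas whose tables use the new storage types will do.
schema = dj.Schema("my_pipeline")

# Inspection only: scan() never deletes. It reports content-addressed and
# path-addressed orphans separately, plus combined totals.
stats = dj.gc.scan(schema, store_name="mystore")
print(dj.gc.format_stats(stats))
print("orphaned content items:", stats["content_orphaned"])
print("orphaned object trees:", stats["object_orphaned"])

# Delete orphans only after reviewing the report above.
result = dj.gc.collect(schema, store_name="mystore", dry_run=False)
print(
    f"deleted {result['content_deleted']} content items and "
    f"{result['object_deleted']} object trees, "
    f"freeing {result['bytes_freed']} bytes ({result['errors']} errors)"
)
```

Keeping the combined `referenced`/`stored`/`orphaned` keys alongside the per-type keys means existing callers of `format_stats()` keep working, while new code can branch on the detailed counts.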