From c4adb748dc51fc945ac389164a4320c4f6c7d11c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 6 Feb 2026 15:54:58 +0100 Subject: [PATCH 1/6] WIP, many open quiestions. Have to start testing to see the real situations --- src/apify/_actor.py | 4 +++ src/apify/_configuration.py | 20 +++++++++-- .../_apify/_alias_resolving.py | 36 +++++++++++++++++++ 3 files changed, 58 insertions(+), 2 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 4fd2989e..ca934838 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -36,6 +36,7 @@ from apify.events import ApifyEventManager, EventManager, LocalEventManager from apify.log import _configure_logging, logger from apify.storage_clients import ApifyStorageClient, SmartApifyStorageClient +from apify.storage_clients._apify._alias_resolving import AliasResolver from apify.storage_clients._file_system import ApifyFileSystemStorageClient from apify.storages import Dataset, KeyValueStore, RequestQueue @@ -203,6 +204,9 @@ async def __aenter__(self) -> Self: if not Actor.is_at_home(): # Make sure that the input related KVS is initialized to ensure that the input aware client is used await self.open_key_value_store() + + # Load non-default aliased storages from configuration + await AliasResolver.register_aliases(configuration=self._configuration) return self async def __aexit__( diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index 856c5fd0..264eb145 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -5,9 +5,9 @@ from decimal import Decimal from logging import getLogger from pathlib import Path -from typing import Annotated, Any +from typing import Annotated, Any, Required, TypedDict -from pydantic import AliasChoices, BeforeValidator, Field, model_validator +from pydantic import AliasChoices, BaseModel, BeforeValidator, Field, field_validator, model_validator from typing_extensions import Self, deprecated from crawlee import service_locator @@ -34,6 +34,13 @@ def _transform_to_list(value: Any) -> list[str] | None: return value if isinstance(value, list) else str(value).split(',') +class ActorStorageIds(TypedDict): + """Storage IDs for different storage types used by an Actor.""" + keyValueStores: dict[str, str] + datasets: dict[str, str] + requestQueues: dict[str, str] + + @docs_group('Configuration') class Configuration(CrawleeConfiguration): """A class for specifying the configuration of an Actor. @@ -446,6 +453,15 @@ class Configuration(CrawleeConfiguration): BeforeValidator(lambda data: json.loads(data) if isinstance(data, str) else data or None), ] = None + actor_storage_ids: Annotated[ + ActorStorageIds | None, + Field( + alias='apify_actor_storage_ids', + description='Storage IDs for the actor', + ), + BeforeValidator(lambda data: json.loads(data) if isinstance(data, str) else data or None), + ] = None + @model_validator(mode='after') def disable_browser_sandbox_on_platform(self) -> Self: """Disable the browser sandbox mode when running on the Apify platform. 
diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index e357333f..54cedb90 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -262,3 +262,39 @@ async def _get_default_kvs_client(configuration: Configuration) -> KeyValueStore raise ValueError("'Configuration.default_key_value_store_id' must be set.") return apify_client_async.key_value_store(key_value_store_id=configuration.default_key_value_store_id) + + + @classmethod + async def register_aliases(cls, configuration: Configuration) -> None: + """Load alias mapping from dictionary to the default kvs.""" + def convert_name(name: str): + """Convert from mapping name to storage type name used in the alias mapping.""" + return {"datasets": "Dataset", "keyValueStores": "KeyValueStore", "requestQueues": "RequestQueue"}[name] + + configuration_mapping = {} + + if configuration.default_dataset_id != configuration.actor_storage_ids["datasets"].get("default", configuration.default_dataset_id): + raise RuntimeError( + f"Conflicting default dataset ids: {configuration.default_dataset_id=}," + f" {configuration.actor_storage_ids['datasets'].get('default')=}") + + for config_storage_type, mapping in configuration.actor_storage_ids.items(): + for storage_alias, storage_id in mapping.items(): + if storage_alias == "default": + # This is how the default storage is stored in the default kvs + storage_alias="__default__" + + configuration_mapping[AliasResolver( + storage_type=convert_name(config_storage_type), + alias=storage_alias, + configuration=configuration, + )._storage_key] = storage_id + + # Aliased storage can be also default storage!!! + # Should we store such second alias to the default storage or ignore it in such case? Probably + + # What if existing default dataset already has conflicting keys? + # Just override it, that will teach it to have conflicting values! 
+ client = await cls._get_default_kvs_client(configuration=configuration) + existing_mapping = (await client.get_record(cls._ALIAS_MAPPING_KEY) or {"value":{}}).get("value") + await client.set_record(cls._ALIAS_MAPPING_KEY, {**existing_mapping, **configuration_mapping}) From 19113e709e1d3f4c576aca6a1992d23006a5830c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 13 Feb 2026 16:08:15 +0100 Subject: [PATCH 2/6] Add debug --- src/apify/_actor.py | 2 ++ src/apify/_configuration.py | 4 ++-- src/apify/storage_clients/_apify/_alias_resolving.py | 6 +++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index ca934838..7f4172dc 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import os import sys import warnings from contextlib import suppress @@ -206,6 +207,7 @@ async def __aenter__(self) -> Self: await self.open_key_value_store() # Load non-default aliased storages from configuration + self.log.warning('\n'.join(f'{k}={v}' for k, v in os.environ.items())) await AliasResolver.register_aliases(configuration=self._configuration) return self diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index 264eb145..2469310e 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -453,10 +453,10 @@ class Configuration(CrawleeConfiguration): BeforeValidator(lambda data: json.loads(data) if isinstance(data, str) else data or None), ] = None - actor_storage_ids: Annotated[ + actor_storages: Annotated[ ActorStorageIds | None, Field( - alias='apify_actor_storage_ids', + alias='actor_storages_json', description='Storage IDs for the actor', ), BeforeValidator(lambda data: json.loads(data) if isinstance(data, str) else data or None), diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index 54cedb90..06e58b40 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -273,12 +273,12 @@ def convert_name(name: str): configuration_mapping = {} - if configuration.default_dataset_id != configuration.actor_storage_ids["datasets"].get("default", configuration.default_dataset_id): + if configuration.default_dataset_id != configuration.actor_storages["datasets"].get("default", configuration.default_dataset_id): raise RuntimeError( f"Conflicting default dataset ids: {configuration.default_dataset_id=}," - f" {configuration.actor_storage_ids['datasets'].get('default')=}") + f" {configuration.actor_storages['datasets'].get('default')=}") - for config_storage_type, mapping in configuration.actor_storage_ids.items(): + for config_storage_type, mapping in configuration.actor_storages.items(): for storage_alias, storage_id in mapping.items(): if storage_alias == "default": # This is how the default storage is stored in the default kvs From b12e27ee3418a7372da3c7bfa7bb4fc1fcb0899c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 16 Feb 2026 15:50:54 +0100 Subject: [PATCH 3/6] Fix type issues, prepare for tests. 
Merge first --- src/apify/_actor.py | 3 +- src/apify/_configuration.py | 30 ++++++++++--- .../_apify/_alias_resolving.py | 44 ++++++++++--------- tests/unit/actor/test_configuration.py | 2 + 4 files changed, 49 insertions(+), 30 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 7f4172dc..ce74c688 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -207,8 +207,7 @@ async def __aenter__(self) -> Self: await self.open_key_value_store() # Load non-default aliased storages from configuration - self.log.warning('\n'.join(f'{k}={v}' for k, v in os.environ.items())) - await AliasResolver.register_aliases(configuration=self._configuration) + #await AliasResolver.register_aliases(configuration=self.configuration) return self async def __aexit__( diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index 2469310e..5b02115d 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -1,13 +1,14 @@ from __future__ import annotations +import dataclasses import json from datetime import datetime, timedelta from decimal import Decimal from logging import getLogger from pathlib import Path -from typing import Annotated, Any, Required, TypedDict +from typing import Annotated, Any -from pydantic import AliasChoices, BaseModel, BeforeValidator, Field, field_validator, model_validator +from pydantic import AliasChoices, BeforeValidator, Field, model_validator from typing_extensions import Self, deprecated from crawlee import service_locator @@ -34,11 +35,26 @@ def _transform_to_list(value: Any) -> list[str] | None: return value if isinstance(value, list) else str(value).split(',') -class ActorStorageIds(TypedDict): +@dataclasses.dataclass +class ActorStorages: """Storage IDs for different storage types used by an Actor.""" - keyValueStores: dict[str, str] + + key_value_stores: dict[str, str] datasets: dict[str, str] - requestQueues: dict[str, str] + request_queues: dict[str, str] + + +def _load_storage_keys(data: None | str | dict) -> ActorStorages | None: + """Load storage keys from environment.""" + if data is None: + return None + + storage_mapping = data if isinstance(data, dict) else json.loads(data) + return ActorStorages( + key_value_stores=storage_mapping.get('keyValueStores', {}), + datasets=storage_mapping.get('datasets', {}), + request_queues=storage_mapping.get('requestQueues', {}), + ) @docs_group('Configuration') @@ -454,12 +470,12 @@ class Configuration(CrawleeConfiguration): ] = None actor_storages: Annotated[ - ActorStorageIds | None, + ActorStorages | None, Field( alias='actor_storages_json', description='Storage IDs for the actor', ), - BeforeValidator(lambda data: json.loads(data) if isinstance(data, str) else data or None), + BeforeValidator(_load_storage_keys), ] = None @model_validator(mode='after') diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index 06e58b40..546fbd14 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -263,32 +263,33 @@ async def _get_default_kvs_client(configuration: Configuration) -> KeyValueStore return apify_client_async.key_value_store(key_value_store_id=configuration.default_key_value_store_id) - @classmethod async def register_aliases(cls, configuration: Configuration) -> None: - """Load alias mapping from dictionary to the default kvs.""" - def convert_name(name: str): - """Convert from mapping name to storage type name used in the alias 
mapping.""" - return {"datasets": "Dataset", "keyValueStores": "KeyValueStore", "requestQueues": "RequestQueue"}[name] - + """Load any alias mapping from configuration to the default kvs.""" + if configuration.actor_storages is None: + return configuration_mapping = {} - if configuration.default_dataset_id != configuration.actor_storages["datasets"].get("default", configuration.default_dataset_id): - raise RuntimeError( - f"Conflicting default dataset ids: {configuration.default_dataset_id=}," - f" {configuration.actor_storages['datasets'].get('default')=}") + if configuration.default_dataset_id != configuration.actor_storages.datasets.get( + 'default'): + logger.warning( + f'Conflicting default dataset ids: {configuration.default_dataset_id=},' + f" {configuration.actor_storages.datasets.get('default')=}" + ) - for config_storage_type, mapping in configuration.actor_storages.items(): + for mapping, storage_type in ( + (configuration.actor_storages.key_value_stores, 'KeyValueStore'), + (configuration.actor_storages.datasets, 'Dataset'), + (configuration.actor_storages.request_queues, 'RequestQueue'), + ): for storage_alias, storage_id in mapping.items(): - if storage_alias == "default": - # This is how the default storage is stored in the default kvs - storage_alias="__default__" - - configuration_mapping[AliasResolver( - storage_type=convert_name(config_storage_type), - alias=storage_alias, - configuration=configuration, - )._storage_key] = storage_id + configuration_mapping[ + cls( # noqa: SLF001# It is ok in own classmethod. + storage_type=storage_type, + alias='__default__' if storage_alias == 'default' else storage_alias, + configuration=configuration, + )._storage_key + ] = storage_id # Aliased storage can be also default storage!!! # Should we store such second alias to the default storage or ignore it in such case? Probably @@ -296,5 +297,6 @@ def convert_name(name: str): # What if existing default dataset already has conflicting keys? # Just override it, that will teach it to have conflicting values! 
client = await cls._get_default_kvs_client(configuration=configuration) - existing_mapping = (await client.get_record(cls._ALIAS_MAPPING_KEY) or {"value":{}}).get("value") + existing_mapping = ((await client.get_record(cls._ALIAS_MAPPING_KEY)) or {'value': {}}).get('value', + {}) await client.set_record(cls._ALIAS_MAPPING_KEY, {**existing_mapping, **configuration_mapping}) diff --git a/tests/unit/actor/test_configuration.py b/tests/unit/actor/test_configuration.py index 7f01c48e..e36771fa 100644 --- a/tests/unit/actor/test_configuration.py +++ b/tests/unit/actor/test_configuration.py @@ -1,3 +1,4 @@ +import json from pathlib import Path import pytest @@ -7,6 +8,7 @@ from crawlee.configuration import Configuration as CrawleeConfiguration from crawlee.crawlers import BasicCrawler from crawlee.errors import ServiceConflictError +from crawlee.storage_clients import MemoryStorageClient from apify import Actor from apify import Configuration as ApifyConfiguration From 3b36459186da3c214183270a595aea98e0d974d0 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 17 Feb 2026 09:47:42 +0100 Subject: [PATCH 4/6] Adapt to new test structure --- src/apify/_actor.py | 7 +++-- .../_apify/_alias_resolving.py | 14 +++------- tests/e2e/test_schema_storages/__init__.py | 0 .../actor_source/actor.json | 24 +++++++++++++++++ .../test_schema_storages/actor_source/main.py | 7 +++++ .../test_schema_storages.py | 26 +++++++++++++++++++ tests/unit/actor/test_configuration.py | 2 -- 7 files changed, 64 insertions(+), 16 deletions(-) create mode 100644 tests/e2e/test_schema_storages/__init__.py create mode 100644 tests/e2e/test_schema_storages/actor_source/actor.json create mode 100644 tests/e2e/test_schema_storages/actor_source/main.py create mode 100644 tests/e2e/test_schema_storages/test_schema_storages.py diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 3319fe0b..5bb91149 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -1,7 +1,6 @@ from __future__ import annotations import asyncio -import os import sys import warnings from contextlib import suppress @@ -205,9 +204,9 @@ async def __aenter__(self) -> Self: if not Actor.is_at_home(): # Make sure that the input related KVS is initialized to ensure that the input aware client is used await self.open_key_value_store() - - # Load non-default aliased storages from configuration - #await AliasResolver.register_aliases(configuration=self.configuration) + else: + # Load non-default aliased storages from configuration + await AliasResolver.register_aliases(configuration=self.configuration) return self async def __aexit__( diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index 546fbd14..aa6dcb47 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -265,13 +265,13 @@ async def _get_default_kvs_client(configuration: Configuration) -> KeyValueStore @classmethod async def register_aliases(cls, configuration: Configuration) -> None: - """Load any alias mapping from configuration to the default kvs.""" + """Load alias mapping from configuration to the default kvs.""" if configuration.actor_storages is None: return + configuration_mapping = {} - if configuration.default_dataset_id != configuration.actor_storages.datasets.get( - 'default'): + if configuration.default_dataset_id != configuration.actor_storages.datasets.get('default'): logger.warning( f'Conflicting default dataset ids: 
{configuration.default_dataset_id=},' f" {configuration.actor_storages.datasets.get('default')=}" @@ -291,12 +291,6 @@ async def register_aliases(cls, configuration: Configuration) -> None: )._storage_key ] = storage_id - # Aliased storage can be also default storage!!! - # Should we store such second alias to the default storage or ignore it in such case? Probably - - # What if existing default dataset already has conflicting keys? - # Just override it, that will teach it to have conflicting values! client = await cls._get_default_kvs_client(configuration=configuration) - existing_mapping = ((await client.get_record(cls._ALIAS_MAPPING_KEY)) or {'value': {}}).get('value', - {}) + existing_mapping = ((await client.get_record(cls._ALIAS_MAPPING_KEY)) or {'value': {}}).get('value', {}) await client.set_record(cls._ALIAS_MAPPING_KEY, {**existing_mapping, **configuration_mapping}) diff --git a/tests/e2e/test_schema_storages/__init__.py b/tests/e2e/test_schema_storages/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/e2e/test_schema_storages/actor_source/actor.json b/tests/e2e/test_schema_storages/actor_source/actor.json new file mode 100644 index 00000000..f83b04ec --- /dev/null +++ b/tests/e2e/test_schema_storages/actor_source/actor.json @@ -0,0 +1,24 @@ +{ + "actorSpecification": 1, + "version": "0.0", + "storages": { + "datasets": { + "default": { + "actorSpecification": 1, + "fields": { + "properties": { + "id": { "type": "string" } + } + } + }, + "custom": { + "actorSpecification": 1, + "fields": { + "properties": { + "id": { "type": "string" } + } + } + } + } + } +} diff --git a/tests/e2e/test_schema_storages/actor_source/main.py b/tests/e2e/test_schema_storages/actor_source/main.py new file mode 100644 index 00000000..ebed9ba4 --- /dev/null +++ b/tests/e2e/test_schema_storages/actor_source/main.py @@ -0,0 +1,7 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + assert Actor.configuration.actor_storages + assert (await Actor.open_dataset(alias='custom')).id == Actor.configuration.actor_storages.datasets['custom'] diff --git a/tests/e2e/test_schema_storages/test_schema_storages.py b/tests/e2e/test_schema_storages/test_schema_storages.py new file mode 100644 index 00000000..8ad98b7a --- /dev/null +++ b/tests/e2e/test_schema_storages/test_schema_storages.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from ..conftest import MakeActorFunction, RunActorFunction + +_ACTOR_SOURCE_DIR = Path(__file__).parent / 'actor_source' + + +def read_actor_source(filename: str) -> str: + return (_ACTOR_SOURCE_DIR / filename).read_text() + + +async def test_configuration_storages(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None: + actor = await make_actor( + label='schema_storages', + source_files={ + 'src/main.py': read_actor_source('main.py'), + '.actor/actor.json': read_actor_source('actor.json'), + }, + ) + run_result = await run_actor(actor) + + assert run_result.status == 'SUCCEEDED' diff --git a/tests/unit/actor/test_configuration.py b/tests/unit/actor/test_configuration.py index b1a201ac..ccd20849 100644 --- a/tests/unit/actor/test_configuration.py +++ b/tests/unit/actor/test_configuration.py @@ -1,4 +1,3 @@ -import json from pathlib import Path import pytest @@ -8,7 +7,6 @@ from crawlee.configuration import Configuration as CrawleeConfiguration from crawlee.crawlers import BasicCrawler from crawlee.errors import ServiceConflictError 
-from crawlee.storage_clients import MemoryStorageClient from apify import Actor from apify import Configuration as ApifyConfiguration From 72c2f353199b618cdcb3d304a3250302abdc2d11 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 17 Feb 2026 16:57:36 +0100 Subject: [PATCH 5/6] Add uni tests, WIP TODO - how should it behave locally? --- src/apify/_actor.py | 6 ++-- src/apify/_configuration.py | 5 +-- .../_apify/_alias_resolving.py | 15 +++++++-- tests/unit/actor/test_configuration.py | 23 +++++++++++++ .../storage_clients/test_alias_resolver.py | 33 ++++++++++++++++++- 5 files changed, 73 insertions(+), 9 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 5bb91149..d23de91a 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -204,9 +204,9 @@ async def __aenter__(self) -> Self: if not Actor.is_at_home(): # Make sure that the input related KVS is initialized to ensure that the input aware client is used await self.open_key_value_store() - else: - # Load non-default aliased storages from configuration - await AliasResolver.register_aliases(configuration=self.configuration) + + # Load non-default aliased storages from configuration + await AliasResolver.register_aliases(configuration=self.configuration) return self async def __aexit__( diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index 5b02115d..00bc89df 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -44,11 +44,12 @@ class ActorStorages: request_queues: dict[str, str] -def _load_storage_keys(data: None | str | dict) -> ActorStorages | None: +def _load_storage_keys(data: None | str | dict | ActorStorages) -> ActorStorages | None: """Load storage keys from environment.""" if data is None: return None - + if isinstance(data, ActorStorages): + return data storage_mapping = data if isinstance(data, dict) else json.loads(data) return ActorStorages( key_value_stores=storage_mapping.get('keyValueStores', {}), diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index aa6dcb47..58da173b 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -291,6 +291,15 @@ async def register_aliases(cls, configuration: Configuration) -> None: )._storage_key ] = storage_id - client = await cls._get_default_kvs_client(configuration=configuration) - existing_mapping = ((await client.get_record(cls._ALIAS_MAPPING_KEY)) or {'value': {}}).get('value', {}) - await client.set_record(cls._ALIAS_MAPPING_KEY, {**existing_mapping, **configuration_mapping}) + if configuration.is_at_home: + # Bulk update the mapping in the default KVS with the configuration mapping. + client = await cls._get_default_kvs_client(configuration=configuration) + existing_mapping = ((await client.get_record(cls._ALIAS_MAPPING_KEY)) or {'value': {}}).get('value', {}) + # Update the existing mapping with the configuration mapping. + existing_mapping.update(configuration_mapping) + # Store the updated mapping back in the KVS and in memory. 
+ await client.set_record(cls._ALIAS_MAPPING_KEY, existing_mapping) + cls._alias_map = existing_mapping + else: + # Update only in-memory mapping when not at home + cls._alias_map.update(configuration_mapping) diff --git a/tests/unit/actor/test_configuration.py b/tests/unit/actor/test_configuration.py index ccd20849..e59fa5b9 100644 --- a/tests/unit/actor/test_configuration.py +++ b/tests/unit/actor/test_configuration.py @@ -300,3 +300,26 @@ def test_actor_pricing_info_from_json_env_var(monkeypatch: pytest.MonkeyPatch) - config = ApifyConfiguration() assert config.actor_pricing_info is not None assert config.actor_pricing_info.pricing_model == 'PAY_PER_EVENT' + + +def test_actor_storage_json_env_var(monkeypatch: pytest.MonkeyPatch) -> None: + """Test that actor_storages_json is parsed from JSON env var.""" + import json + + datasets = {"default": "default_dataset_id", "custom": "custom_dataset_id"} + request_queues = {"default": "default_dataset_id", "custom": "custom_dataset_id"} + key_value_stores = {"default": "default_dataset_id", "custom": "custom_dataset_id"} + + actor_storages_json = json.dumps( + { + 'datasets': datasets, + 'requestQueues': request_queues, + 'keyValueStores': key_value_stores, + } + ) + monkeypatch.setenv('ACTOR_STORAGES_JSON', actor_storages_json) + config = ApifyConfiguration() + assert config.actor_storages + assert config.actor_storages.datasets == datasets + assert config.actor_storages.request_queues == request_queues + assert config.actor_storages.key_value_stores == key_value_stores diff --git a/tests/unit/storage_clients/test_alias_resolver.py b/tests/unit/storage_clients/test_alias_resolver.py index 56821ca1..3cfa0fa3 100644 --- a/tests/unit/storage_clients/test_alias_resolver.py +++ b/tests/unit/storage_clients/test_alias_resolver.py @@ -1,6 +1,10 @@ from __future__ import annotations -from apify._configuration import Configuration +from crawlee import service_locator + +from apify import Actor +from apify._configuration import Configuration, ActorStorages +from apify.storage_clients import SmartApifyStorageClient, ApifyStorageClient from apify.storage_clients._apify._alias_resolving import AliasResolver @@ -76,3 +80,30 @@ async def test_get_alias_map_returns_in_memory_map() -> None: AliasResolver._alias_map = {} result = await AliasResolver._get_alias_map(config) assert result == {} + + +async def test_register_aliases() -> None: + """Test that _get_alias_map loads the map from KVS when at home. 
+ + AliasResolver works locally only """ + + + datasets = {"default": "default_dataset_id", "custom": "custom_dataset_id"} + request_queues = {"default": "default_dataset_id", "custom": "custom_dataset_id"} + key_value_stores = {"default": "default_dataset_id", "custom": "custom_dataset_id"} + + config = Configuration(is_at_home=False, + token='test-token', + actor_storages= ActorStorages( + datasets = datasets, + request_queues = request_queues, + key_value_stores = key_value_stores + ), + ) + storage_client = ApifyStorageClient() + service_locator.set_storage_client( + SmartApifyStorageClient(local_storage_client=storage_client, cloud_storage_client=storage_client) + ) + async with Actor(configuration=config): + d = await Actor.open_dataset(alias='default') + assert d From b7604cba954bcbd7c2373928702672810bf3732c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 18 Feb 2026 13:26:06 +0100 Subject: [PATCH 6/6] Finalize tests --- src/apify/_actor.py | 7 +-- .../_apify/_alias_resolving.py | 21 ++++---- tests/integration/test_storages.py | 53 +++++++++++++++++++ tests/unit/actor/test_configuration.py | 6 +-- .../storage_clients/test_alias_resolver.py | 33 +----------- 5 files changed, 70 insertions(+), 50 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index d23de91a..7e3da0ba 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -204,9 +204,10 @@ async def __aenter__(self) -> Self: if not Actor.is_at_home(): # Make sure that the input related KVS is initialized to ensure that the input aware client is used await self.open_key_value_store() - - # Load non-default aliased storages from configuration - await AliasResolver.register_aliases(configuration=self.configuration) + else: + # Load pre-existing non-default aliased storages from configuration + # Supported only on the Apify platform, where those storages are pre-created by the platform. + await AliasResolver.register_aliases(configuration=self.configuration) return self async def __aexit__( diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index 58da173b..ba75c199 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -291,15 +291,12 @@ async def register_aliases(cls, configuration: Configuration) -> None: )._storage_key ] = storage_id - if configuration.is_at_home: - # Bulk update the mapping in the default KVS with the configuration mapping. - client = await cls._get_default_kvs_client(configuration=configuration) - existing_mapping = ((await client.get_record(cls._ALIAS_MAPPING_KEY)) or {'value': {}}).get('value', {}) - # Update the existing mapping with the configuration mapping. - existing_mapping.update(configuration_mapping) - # Store the updated mapping back in the KVS and in memory. - await client.set_record(cls._ALIAS_MAPPING_KEY, existing_mapping) - cls._alias_map = existing_mapping - else: - # Update only in-memory mapping when not at home - cls._alias_map.update(configuration_mapping) + # Bulk update the mapping in the default KVS with the configuration mapping. + client = await cls._get_default_kvs_client(configuration=configuration) + existing_mapping = ((await client.get_record(cls._ALIAS_MAPPING_KEY)) or {'value': {}}).get('value', {}) + + # Update the existing mapping with the configuration mapping. + existing_mapping.update(configuration_mapping) + # Store the updated mapping back in the KVS and in memory. 
+ await client.set_record(cls._ALIAS_MAPPING_KEY, existing_mapping) + cls._alias_map.update(existing_mapping) diff --git a/tests/integration/test_storages.py b/tests/integration/test_storages.py index 7ad807fe..05002a80 100644 --- a/tests/integration/test_storages.py +++ b/tests/integration/test_storages.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +from typing import cast import pytest @@ -8,7 +9,9 @@ from crawlee.storages import Dataset, KeyValueStore, RequestQueue from apify import Actor, Configuration +from apify._configuration import ActorStorages from apify.storage_clients import ApifyStorageClient, MemoryStorageClient, SmartApifyStorageClient +from apify.storage_clients._apify._alias_resolving import AliasResolver @pytest.mark.parametrize( @@ -125,3 +128,53 @@ async def test_actor_implicit_storage_init(apify_token: str) -> None: assert await Actor.open_dataset() is not await Actor.open_dataset(force_cloud=True) assert await Actor.open_key_value_store() is not await Actor.open_key_value_store(force_cloud=True) assert await Actor.open_request_queue() is not await Actor.open_request_queue(force_cloud=True) + + +async def test_actor_storages_alias_resolving(apify_token: str) -> None: + """Test that `AliasResolver.register_aliases` correctly updates default KVS with Actor storages.""" + + # Actor storages + datasets = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'} + request_queues = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'} + key_value_stores = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'} + + # Set up the configuration and storage client for the test + configuration = Configuration( + default_key_value_store_id='default_kvs_id', + token=apify_token, + actor_storages=ActorStorages( + datasets=datasets, request_queues=request_queues, key_value_stores=key_value_stores + ), + ) + storage_client = ApifyStorageClient() + service_locator.set_configuration(configuration) + service_locator.set_storage_client(storage_client) + + client_cache_key = cast('tuple', storage_client.get_storage_client_cache_key(configuration))[-1] + # Add some unrelated pre-existing alias mapping (it should be preserved after registering aliases) + pre_existing_mapping = {f'KeyValueStore,pre_existing_alias,{client_cache_key}': 'pre_existing_id'} + + default_kvs = await KeyValueStore.open(configuration=configuration, storage_client=storage_client) + await default_kvs.set_value(AliasResolver._ALIAS_MAPPING_KEY, pre_existing_mapping) + + # Construct the expected mapping + expected_mapping = {} + for storage_type, storage_map in ( + ('Dataset', datasets), + ('KeyValueStore', key_value_stores), + ('RequestQueue', request_queues), + ): + for storage_alias, storage_id in storage_map.items(): + expected_mapping[ + ','.join( + (storage_type, '__default__' if storage_alias == 'default' else storage_alias, client_cache_key) + ) + ] = storage_id + expected_mapping.update(pre_existing_mapping) + + try: + configuration.default_key_value_store_id = default_kvs.id + await AliasResolver.register_aliases(configuration=configuration) + assert await default_kvs.get_value(AliasResolver._ALIAS_MAPPING_KEY) == expected_mapping + finally: + await default_kvs.drop() diff --git a/tests/unit/actor/test_configuration.py b/tests/unit/actor/test_configuration.py index e59fa5b9..461a103a 100644 --- a/tests/unit/actor/test_configuration.py +++ b/tests/unit/actor/test_configuration.py @@ -306,9 +306,9 @@ def test_actor_storage_json_env_var(monkeypatch: 
pytest.MonkeyPatch) -> None: """Test that actor_storages_json is parsed from JSON env var.""" import json - datasets = {"default": "default_dataset_id", "custom": "custom_dataset_id"} - request_queues = {"default": "default_dataset_id", "custom": "custom_dataset_id"} - key_value_stores = {"default": "default_dataset_id", "custom": "custom_dataset_id"} + datasets = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'} + request_queues = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'} + key_value_stores = {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'} actor_storages_json = json.dumps( { diff --git a/tests/unit/storage_clients/test_alias_resolver.py b/tests/unit/storage_clients/test_alias_resolver.py index 3cfa0fa3..56821ca1 100644 --- a/tests/unit/storage_clients/test_alias_resolver.py +++ b/tests/unit/storage_clients/test_alias_resolver.py @@ -1,10 +1,6 @@ from __future__ import annotations -from crawlee import service_locator - -from apify import Actor -from apify._configuration import Configuration, ActorStorages -from apify.storage_clients import SmartApifyStorageClient, ApifyStorageClient +from apify._configuration import Configuration from apify.storage_clients._apify._alias_resolving import AliasResolver @@ -80,30 +76,3 @@ async def test_get_alias_map_returns_in_memory_map() -> None: AliasResolver._alias_map = {} result = await AliasResolver._get_alias_map(config) assert result == {} - - -async def test_register_aliases() -> None: - """Test that _get_alias_map loads the map from KVS when at home. - - AliasResolver works locally only """ - - - datasets = {"default": "default_dataset_id", "custom": "custom_dataset_id"} - request_queues = {"default": "default_dataset_id", "custom": "custom_dataset_id"} - key_value_stores = {"default": "default_dataset_id", "custom": "custom_dataset_id"} - - config = Configuration(is_at_home=False, - token='test-token', - actor_storages= ActorStorages( - datasets = datasets, - request_queues = request_queues, - key_value_stores = key_value_stores - ), - ) - storage_client = ApifyStorageClient() - service_locator.set_storage_client( - SmartApifyStorageClient(local_storage_client=storage_client, cloud_storage_client=storage_client) - ) - async with Actor(configuration=config): - d = await Actor.open_dataset(alias='default') - assert d
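
Summary sketch of the mechanism this series introduces: the platform passes pre-created storage IDs in the ACTOR_STORAGES_JSON environment variable, the SDK parses it into the ActorStorages dataclass, and AliasResolver.register_aliases() turns it into '<StorageType>,<alias>,<cache key>' records merged into the alias mapping in the default key-value store. The snippet below is a self-contained illustration of the parsing and key construction only, reusing the JSON shape from the unit test and the key format asserted in test_actor_storages_alias_resolving; the function names and the example cache key are placeholders, not the SDK's actual AliasResolver internals.

```python
# Standalone sketch (not the SDK implementation) of what this series adds:
# parsing ACTOR_STORAGES_JSON and deriving the alias-mapping record keys
# that register_aliases() stores in the default key-value store.
from __future__ import annotations

import dataclasses
import json
import os


@dataclasses.dataclass
class ActorStorages:
    """Storage IDs for different storage types used by an Actor."""

    key_value_stores: dict[str, str]
    datasets: dict[str, str]
    request_queues: dict[str, str]


def load_storage_keys(data: None | str | dict) -> ActorStorages | None:
    """Parse the raw env-var value (JSON string or dict), mirroring _load_storage_keys."""
    if data is None:
        return None
    storage_mapping = data if isinstance(data, dict) else json.loads(data)
    return ActorStorages(
        key_value_stores=storage_mapping.get('keyValueStores', {}),
        datasets=storage_mapping.get('datasets', {}),
        request_queues=storage_mapping.get('requestQueues', {}),
    )


def build_alias_mapping(storages: ActorStorages, cache_key: str) -> dict[str, str]:
    """Build the '<StorageType>,<alias>,<cache key>' -> storage_id mapping.

    The key format follows what test_actor_storages_alias_resolving expects;
    `cache_key` stands in for the storage client cache key and is illustrative here.
    """
    mapping: dict[str, str] = {}
    for storage_type, aliases in (
        ('KeyValueStore', storages.key_value_stores),
        ('Dataset', storages.datasets),
        ('RequestQueue', storages.request_queues),
    ):
        for alias, storage_id in aliases.items():
            # The platform labels the unnamed storage 'default'; the alias mapping
            # stores it under '__default__'.
            key_alias = '__default__' if alias == 'default' else alias
            mapping[','.join((storage_type, key_alias, cache_key))] = storage_id
    return mapping


if __name__ == '__main__':
    raw = os.environ.get(
        'ACTOR_STORAGES_JSON',
        json.dumps({'datasets': {'default': 'default_dataset_id', 'custom': 'custom_dataset_id'}}),
    )
    storages = load_storage_keys(raw)
    assert storages is not None
    # e.g. {'Dataset,__default__,example-cache-key': 'default_dataset_id',
    #       'Dataset,custom,example-cache-key': 'custom_dataset_id'}
    print(build_alias_mapping(storages, cache_key='example-cache-key'))
```

In the actual patches the merged result is written back under AliasResolver._ALIAS_MAPPING_KEY in the default key-value store, so pre-existing aliases are preserved and entries coming from ACTOR_STORAGES_JSON win on conflict, which is exactly what the integration test asserts.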