diff --git a/docs/03_guides/06_scrapy.mdx b/docs/03_guides/06_scrapy.mdx
index 1697b8bb..7d790b7d 100644
--- a/docs/03_guides/06_scrapy.mdx
+++ b/docs/03_guides/06_scrapy.mdx
@@ -17,13 +17,13 @@ import SettingsExample from '!!raw-loader!./code/scrapy_project/src/settings.py'
 
 ## Integrating Scrapy with the Apify platform
 
-The Apify SDK provides an Apify-Scrapy integration. The main challenge of this is to combine two asynchronous frameworks that use different event loop implementations. Scrapy uses [Twisted](https://twisted.org/) for asynchronous execution, while the Apify SDK is based on [asyncio](https://docs.python.org/3/library/asyncio.html). The key thing is to install the Twisted's `asyncioreactor` to run Twisted's asyncio compatible event loop. This allows both Twisted and asyncio to run on a single event loop, enabling a Scrapy spider to run as an Apify Actor with minimal modifications.
+The Apify SDK provides an Apify-Scrapy integration. The main challenge is combining two asynchronous frameworks that use different event loop implementations: Scrapy uses [Twisted](https://twisted.org/) for asynchronous execution, while the Apify SDK is based on [asyncio](https://docs.python.org/3/library/asyncio.html). The key is to install Twisted's `asyncioreactor`, which runs Twisted's asyncio-compatible event loop; the `apify.scrapy.run_scrapy_actor` function handles this reactor installation automatically. This allows both Twisted and asyncio to run on a single event loop, enabling a Scrapy spider to run as an Apify Actor with minimal modifications.
 
 {UnderscoreMainExample}
 
-In this setup, `apify.scrapy.initialize_logging` configures an Apify log formatter and reconfigures loggers to ensure consistent logging across Scrapy, the Apify SDK, and other libraries. The `apify.scrapy.run_scrapy_actor` bridges asyncio coroutines with Twisted's reactor, enabling the Actor's main coroutine, which contains the Scrapy spider, to be executed.
+In this setup, `apify.scrapy.initialize_logging` configures an Apify log formatter and reconfigures loggers to ensure consistent logging across Scrapy, the Apify SDK, and other libraries. The `apify.scrapy.run_scrapy_actor` function installs Twisted's asyncio-compatible reactor and bridges asyncio coroutines with it, enabling execution of the Actor's main coroutine, which contains the Scrapy spider.
 
 Make sure the `SCRAPY_SETTINGS_MODULE` environment variable is set to the path of the Scrapy settings module. This variable is also used by the `Actor` class to detect that the project is a Scrapy project, triggering additional actions.
 
@@ -47,7 +47,7 @@ Additional helper functions in the [`apify.scrapy`](https://github.com/apify/api
 - `apply_apify_settings` - Applies Apify-specific components to Scrapy settings.
 - `to_apify_request` and `to_scrapy_request` - Convert between Apify and Scrapy request objects.
 - `initialize_logging` - Configures logging for the Actor environment.
-- `run_scrapy_actor` - Bridges asyncio and Twisted event loops.
+- `run_scrapy_actor` - Installs Twisted's asyncio reactor and bridges the asyncio and Twisted event loops.
 
 ## Create a new Apify-Scrapy project
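> Review note: to make the bridging described in the docs concrete, here is a minimal, self-contained sketch (not part of this PR) of the pattern that `run_scrapy_actor` now performs internally — install Twisted's asyncio-compatible reactor, then drive an asyncio coroutine to completion from Twisted's `react`. The `main` coroutine is a placeholder standing in for the Actor's main.

```python
import asyncio

from scrapy.utils.reactor import install_reactor

# Install Twisted's asyncio-compatible reactor before importing anything
# else from Twisted, so both frameworks share one asyncio event loop.
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')

from twisted.internet.defer import Deferred
from twisted.internet.task import react


async def main() -> None:
    # Placeholder for the Actor's main coroutine containing the Scrapy spider.
    await asyncio.sleep(0.1)
    print('Twisted and asyncio are running on a single event loop')


# `react` starts the reactor, waits for the returned Deferred to fire, and
# then stops the reactor; `Deferred.fromFuture` adapts the asyncio Task.
react(lambda _reactor: Deferred.fromFuture(asyncio.ensure_future(main())))
```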
diff --git a/docs/03_guides/code/scrapy_project/src/__main__.py b/docs/03_guides/code/scrapy_project/src/__main__.py
index bc2cf9ba..807447c9 100644
--- a/docs/03_guides/code/scrapy_project/src/__main__.py
+++ b/docs/03_guides/code/scrapy_project/src/__main__.py
@@ -1,11 +1,5 @@
 from __future__ import annotations
 
-from scrapy.utils.reactor import install_reactor
-
-# Install Twisted's asyncio reactor before importing any other Twisted or
-# Scrapy components.
-install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
-
 import os
 
 from apify.scrapy import initialize_logging, run_scrapy_actor
diff --git a/docs/03_guides/code/scrapy_project/src/main.py b/docs/03_guides/code/scrapy_project/src/main.py
index 608a867b..d8b67984 100644
--- a/docs/03_guides/code/scrapy_project/src/main.py
+++ b/docs/03_guides/code/scrapy_project/src/main.py
@@ -1,8 +1,8 @@
 from __future__ import annotations
+
 import asyncio
 
-from scrapy.crawler import CrawlerRunner
-from scrapy.utils.defer import deferred_to_future
+from scrapy.crawler import AsyncCrawlerRunner
 
 from apify import Actor
 from apify.scrapy import apply_apify_settings
@@ -23,14 +23,13 @@ async def main() -> None:
 
     # Apply Apify settings, which will override the Scrapy project settings.
     settings = apply_apify_settings(proxy_config=proxy_config)
 
-    # Create CrawlerRunner and execute the Scrapy spider.
-    crawler_runner = CrawlerRunner(settings)
-    crawl_deferred = crawler_runner.crawl(
+    # Create AsyncCrawlerRunner and execute the Scrapy spider.
+    crawler_runner = AsyncCrawlerRunner(settings)
+    await crawler_runner.crawl(
         Spider,
         start_urls=start_urls,
         allowed_domains=allowed_domains,
     )
-    await deferred_to_future(crawl_deferred)
 
 if __name__ == '__main__':
diff --git a/pyproject.toml b/pyproject.toml
index e78e5474..6bc15d04 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,7 +50,7 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-scrapy = ["scrapy>=2.11.0"]
+scrapy = ["scrapy>=2.14.0"]
 
 [project.urls]
 "Apify Homepage" = "https://apify.com"
@@ -161,10 +161,6 @@ indent-style = "space"
     "PLR2004", # Magic value used in comparison, consider replacing `{value}` with a constant variable
     "PLW0603", # Using the global statement to update `{name}` is discouraged
 ]
-"**/docs/**/scrapy_project/**/__main__.py" = [
-    # Because of asyncioreactor.install() call.
-    "E402", # Module level import not at top of file
-]
 "**/docs/**/scrapy_project/**" = [
     # Local imports are mixed up with the Apify SDK.
     "I001", # Import block is un-sorted or un-formatted
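> Review note: for Actors migrating along with the `src/main.py` change above, the difference in the crawl invocation reduces to the following sketch. The spider class is a placeholder, and `AsyncCrawlerRunner` (the asyncio-native runner this PR adopts) requires the asyncio reactor that `run_scrapy_actor` now installs.

```python
from scrapy.crawler import AsyncCrawlerRunner, CrawlerRunner
from scrapy.utils.defer import deferred_to_future

from apify.scrapy import apply_apify_settings


async def crawl_before(spider_cls: type) -> None:
    # Old pattern: CrawlerRunner.crawl() returns a Twisted Deferred that
    # has to be converted into an asyncio awaitable by hand.
    runner = CrawlerRunner(apply_apify_settings())
    await deferred_to_future(runner.crawl(spider_cls))


async def crawl_after(spider_cls: type) -> None:
    # New pattern (Scrapy >= 2.14): the crawl can be awaited directly.
    runner = AsyncCrawlerRunner(apply_apify_settings())
    await runner.crawl(spider_cls)
```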
diff --git a/src/apify/scrapy/_actor_runner.py b/src/apify/scrapy/_actor_runner.py
index 390b2fc3..58cb17f9 100644
--- a/src/apify/scrapy/_actor_runner.py
+++ b/src/apify/scrapy/_actor_runner.py
@@ -3,24 +3,22 @@
 import asyncio
 from typing import TYPE_CHECKING
 
-from twisted.internet.defer import Deferred, ensureDeferred
-from twisted.internet.task import react
-
 if TYPE_CHECKING:
     from collections.abc import Coroutine
 
 
-async def _run_coro_as_deferred(coro: Coroutine) -> None:
-    """Wrap the given asyncio coroutine in a Task and await its result as a Twisted Deferred."""
-    task = asyncio.ensure_future(coro)
-    await Deferred.fromFuture(task)
-
-
 def run_scrapy_actor(coro: Coroutine) -> None:
     """Start Twisted's reactor and execute the provided Actor coroutine.
 
-    This function initiates the Twisted reactor and runs the given asyncio coroutine (typically the
-    Actor's main) by converting it to a Deferred. This bridges the asyncio and Twisted event loops,
-    enabling the Apify and Scrapy integration to work together.
+    This function installs Twisted's asyncio-compatible reactor, then starts it and runs the given asyncio
+    coroutine (typically the Actor's main) by converting it to a Deferred. This bridges the asyncio and Twisted
+    event loops, enabling the Apify and Scrapy integration to work together.
     """
-    react(lambda _: ensureDeferred(_run_coro_as_deferred(coro)))
+    from scrapy.utils.reactor import install_reactor  # noqa: PLC0415
+
+    install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
+
+    from twisted.internet.defer import Deferred  # noqa: PLC0415
+    from twisted.internet.task import react  # noqa: PLC0415
+
+    react(lambda _reactor: Deferred.fromFuture(asyncio.ensure_future(coro)))
diff --git a/src/apify/scrapy/pipelines/actor_dataset_push.py b/src/apify/scrapy/pipelines/actor_dataset_push.py
index 995af774..b3a80187 100644
--- a/src/apify/scrapy/pipelines/actor_dataset_push.py
+++ b/src/apify/scrapy/pipelines/actor_dataset_push.py
@@ -8,7 +8,7 @@
 from apify import Actor
 
 if TYPE_CHECKING:
-    from scrapy import Item, Spider
+    from scrapy import Item
 
 logger = getLogger(__name__)
 
@@ -22,10 +22,9 @@ class ActorDatasetPushPipeline:
     async def process_item(
         self,
         item: Item,
-        spider: Spider,
     ) -> Item:
         """Pushes the provided Scrapy item to the Actor's default dataset."""
         item_dict = ItemAdapter(item).asdict()
-        logger.debug(f'Pushing item={item_dict} produced by spider={spider} to the dataset.')
+        logger.debug(f'Pushing item={item_dict} to the dataset.')
         await Actor.push_data(item_dict)
         return item
diff --git a/tests/e2e/test_actor_scrapy.py b/tests/e2e/test_actor_scrapy.py
index b4dd5fee..c7327b58 100644
--- a/tests/e2e/test_actor_scrapy.py
+++ b/tests/e2e/test_actor_scrapy.py
@@ -28,7 +28,7 @@ async def test_actor_scrapy_title_spider(
     actor = await make_actor(
         'actor-scrapy-title-spider',
         source_files=actor_source_files,
-        additional_requirements=['scrapy~=2.12.0'],
+        additional_requirements=['scrapy>=2.14.0'],
     )
     run_result = await run_actor(
         actor,
diff --git a/tests/e2e/test_scrapy/actor_source/__main__.py b/tests/e2e/test_scrapy/actor_source/__main__.py
index edfdaae5..8443f09b 100644
--- a/tests/e2e/test_scrapy/actor_source/__main__.py
+++ b/tests/e2e/test_scrapy/actor_source/__main__.py
@@ -1,14 +1,9 @@
 from __future__ import annotations
 
-from scrapy.utils.reactor import install_reactor
+import os
 
-install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
-
-import os  # noqa: E402, I001
-
-from apify.scrapy import initialize_logging, run_scrapy_actor  # noqa: E402
-
-from .main import main  # noqa: E402
+from .main import main
+from apify.scrapy import initialize_logging, run_scrapy_actor
 
 os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'
 
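> Review note: since `run_scrapy_actor` now installs the reactor itself, an entry point reduces to ordinary top-of-file imports with no `install_reactor` call or `noqa: E402` suppressions. A sketch of the resulting `__main__.py`, assuming a `src.settings` module and a `main` coroutine as in the files above; the final two calls are an assumption about how the imported helpers are wired up, since that part of the file is outside the hunk.

```python
from __future__ import annotations

import os

from apify.scrapy import initialize_logging, run_scrapy_actor

from .main import main

# Scrapy reads its settings from this module; the Actor class also uses the
# variable to detect that the project is a Scrapy project.
os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'

if __name__ == '__main__':
    initialize_logging()
    run_scrapy_actor(main())
```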
diff --git a/tests/e2e/test_scrapy/actor_source/main.py b/tests/e2e/test_scrapy/actor_source/main.py
index 173b5961..e28a0a98 100644
--- a/tests/e2e/test_scrapy/actor_source/main.py
+++ b/tests/e2e/test_scrapy/actor_source/main.py
@@ -1,16 +1,14 @@
-from __future__ import annotations  # noqa: I001
+from __future__ import annotations
 
-from scrapy.crawler import CrawlerRunner
-from scrapy.utils.defer import deferred_to_future
+from scrapy.crawler import AsyncCrawlerRunner
+from .spiders import Spider  # ty: ignore[unresolved-import]
 
 from apify import Actor
 from apify.scrapy import apply_apify_settings
 
-from .spiders import Spider  # ty: ignore[unresolved-import]
-
 
 async def main() -> None:
     async with Actor:
         settings = apply_apify_settings()
-        runner = CrawlerRunner(settings)
-        await deferred_to_future(runner.crawl(Spider, start_urls=['http://localhost:8080/']))
+        runner = AsyncCrawlerRunner(settings)
+        await runner.crawl(Spider, start_urls=['http://localhost:8080/'])
diff --git a/tests/e2e/test_scrapy/actor_source/main_custom_pipeline.py b/tests/e2e/test_scrapy/actor_source/main_custom_pipeline.py
index 39250de3..e29ae914 100644
--- a/tests/e2e/test_scrapy/actor_source/main_custom_pipeline.py
+++ b/tests/e2e/test_scrapy/actor_source/main_custom_pipeline.py
@@ -1,19 +1,17 @@
-from __future__ import annotations  # noqa: I001
+from __future__ import annotations
 
 import os
 
-from scrapy.crawler import CrawlerRunner
-from scrapy.utils.defer import deferred_to_future
+from scrapy.crawler import AsyncCrawlerRunner
+from .spiders import Spider  # ty: ignore[unresolved-import]
 
 from apify import Actor
 from apify.scrapy import apply_apify_settings
 
-from .spiders import Spider  # ty: ignore[unresolved-import]
-
 
 async def main() -> None:
     async with Actor:
         os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings_custom_pipeline'
         settings = apply_apify_settings()
-        runner = CrawlerRunner(settings)
-        await deferred_to_future(runner.crawl(Spider, start_urls=['http://localhost:8080/']))
+        runner = AsyncCrawlerRunner(settings)
+        await runner.crawl(Spider, start_urls=['http://localhost:8080/'])
diff --git a/tests/e2e/test_scrapy/actor_source/pipelines.py b/tests/e2e/test_scrapy/actor_source/pipelines.py
index 2367baf1..3aa0d8c0 100644
--- a/tests/e2e/test_scrapy/actor_source/pipelines.py
+++ b/tests/e2e/test_scrapy/actor_source/pipelines.py
@@ -3,14 +3,13 @@
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
-    from scrapy import Item, Spider
+    from scrapy import Item
 
 
 class PriceCleanerPipeline:
     def process_item(
         self,
         item: Item,
-        _: Spider,
     ) -> Item:
         if 'price' in item and isinstance(item['price'], str):
             item['price'] = item['price'].lstrip('$')
diff --git a/tests/e2e/test_scrapy/test_basic_spider.py b/tests/e2e/test_scrapy/test_basic_spider.py
index b0a69c07..c4101a25 100644
--- a/tests/e2e/test_scrapy/test_basic_spider.py
+++ b/tests/e2e/test_scrapy/test_basic_spider.py
@@ -12,7 +12,7 @@ async def test_basic_spider(make_actor: MakeActorFunction, run_actor: RunActorFu
     actor = await make_actor(
         label='scrapy-basic',
         source_files=get_scrapy_source_files('spider_basic.py', 'BasicSpider'),
-        additional_requirements=['scrapy~=2.12.0'],
+        additional_requirements=['scrapy>=2.14.0'],
     )
     run_result = await run_actor(actor)
     await verify_spider_results(actor, run_result)
diff --git a/tests/e2e/test_scrapy/test_cb_kwargs_spider.py b/tests/e2e/test_scrapy/test_cb_kwargs_spider.py
index 599dd6d2..57692fc0 100644
--- a/tests/e2e/test_scrapy/test_cb_kwargs_spider.py
+++ b/tests/e2e/test_scrapy/test_cb_kwargs_spider.py
@@ -30,7 +30,7 @@ async def test_cb_kwargs_spider(make_actor: MakeActorFunction, run_actor: RunAct
     actor = await make_actor(
         label='scrapy-cb-kwargs',
         source_files=get_scrapy_source_files('spider_cb_kwargs.py', 'CbKwargsSpider'),
-        additional_requirements=['scrapy~=2.12.0'],
+        additional_requirements=['scrapy>=2.14.0'],
     )
     run_result = await run_actor(actor)
     await verify_spider_results(actor, run_result, expected_products=_EXPECTED_PRODUCTS)
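> Review note: with the `spider` parameter dropped from `process_item` (as in `actor_dataset_push.py` and the `PriceCleanerPipeline` fixture above), project pipelines shrink to a single-argument signature. A sketch of a custom pipeline against the new signature; the class name and the float conversion are hypothetical additions for illustration.

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from scrapy import Item


class PriceNormalizerPipeline:
    # Hypothetical pipeline modeled on PriceCleanerPipeline above, but it
    # also converts the cleaned price string to a float.
    def process_item(self, item: Item) -> Item:
        if 'price' in item and isinstance(item['price'], str):
            item['price'] = float(item['price'].lstrip('$'))
        return item
```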
diff --git a/tests/e2e/test_scrapy/test_crawl_spider.py b/tests/e2e/test_scrapy/test_crawl_spider.py
index f4c3b7b2..957965cd 100644
--- a/tests/e2e/test_scrapy/test_crawl_spider.py
+++ b/tests/e2e/test_scrapy/test_crawl_spider.py
@@ -12,7 +12,7 @@ async def test_crawl_spider(make_actor: MakeActorFunction, run_actor: RunActorFu
     actor = await make_actor(
         label='scrapy-crawl',
         source_files=get_scrapy_source_files('spider_crawl.py', 'CrawlProductSpider'),
-        additional_requirements=['scrapy~=2.12.0'],
+        additional_requirements=['scrapy>=2.14.0'],
     )
     run_result = await run_actor(actor)
     await verify_spider_results(actor, run_result)
diff --git a/tests/e2e/test_scrapy/test_custom_pipeline_spider.py b/tests/e2e/test_scrapy/test_custom_pipeline_spider.py
index 65f03cc9..7c0e8307 100644
--- a/tests/e2e/test_scrapy/test_custom_pipeline_spider.py
+++ b/tests/e2e/test_scrapy/test_custom_pipeline_spider.py
@@ -26,7 +26,7 @@ async def test_custom_pipeline_spider(make_actor: MakeActorFunction, run_actor:
                 'src/pipelines.py': read_actor_source('pipelines.py'),
             },
         ),
-        additional_requirements=['scrapy~=2.12.0'],
+        additional_requirements=['scrapy>=2.14.0'],
     )
     run_result = await run_actor(actor)
     await verify_spider_results(actor, run_result, expected_products=_EXPECTED_PRODUCTS)
diff --git a/tests/e2e/test_scrapy/test_itemloader_spider.py b/tests/e2e/test_scrapy/test_itemloader_spider.py
index 1b486d66..f1ce73c4 100644
--- a/tests/e2e/test_scrapy/test_itemloader_spider.py
+++ b/tests/e2e/test_scrapy/test_itemloader_spider.py
@@ -12,7 +12,7 @@ async def test_itemloader_spider(make_actor: MakeActorFunction, run_actor: RunAc
     actor = await make_actor(
         label='scrapy-itemloader',
         source_files=get_scrapy_source_files('spider_itemloader.py', 'ItemLoaderSpider'),
-        additional_requirements=['scrapy~=2.12.0'],
+        additional_requirements=['scrapy>=2.14.0'],
     )
     run_result = await run_actor(actor)
     await verify_spider_results(actor, run_result)
diff --git a/tests/unit/scrapy/pipelines/test_actor_dataset_push.py b/tests/unit/scrapy/pipelines/test_actor_dataset_push.py
index c8ed5732..459e9818 100644
--- a/tests/unit/scrapy/pipelines/test_actor_dataset_push.py
+++ b/tests/unit/scrapy/pipelines/test_actor_dataset_push.py
@@ -3,16 +3,12 @@
 from dataclasses import dataclass
 
 import pytest
-from scrapy import Field, Item, Spider
+from scrapy import Field, Item
 
 from apify import Actor
 from apify.scrapy.pipelines import ActorDatasetPushPipeline
 
 
-class DummySpider(Spider):
-    name = 'dummy_spider'
-
-
 class DummyItem(Item):
     a = Field()
     b = Field()
@@ -24,12 +20,6 @@ class TitleItem(Item):
     title = Field()
 
 
-@pytest.fixture
-def spider() -> DummySpider:
-    """Fixture to create a "dummy" Scrapy spider."""
-    return DummySpider()
-
-
 @pytest.fixture
 def pipeline() -> ActorDatasetPushPipeline:
     """Fixture to create an Actor dataset push pipeline."""
@@ -67,7 +57,6 @@ async def test_process_item(
     monkeypatch: pytest.MonkeyPatch,
     pipeline: ActorDatasetPushPipeline,
-    spider: Spider,
     tc: ItemTestCase,
 ) -> None:
     dataset = []
 
@@ -79,9 +68,9 @@ async def mock_push_data(item: dict) -> None:
 
     if tc.expected_exception:
         with pytest.raises(tc.expected_exception):
-            await pipeline.process_item(tc.item, spider)
+            await pipeline.process_item(tc.item)
     else:
-        output = await pipeline.process_item(tc.item, spider)
+        output = await pipeline.process_item(tc.item)
         assert output == tc.item
         assert dataset == [tc.item_dict]
 
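> Review note: the unit test above now exercises `process_item` without a spider fixture. A condensed sketch of the pattern, assuming the suite's async test support is configured (e.g. pytest-asyncio in auto mode) and that `Actor.push_data` is monkeypatched as the visible fixture code implies; the `TitleItem` fields and values here are illustrative.

```python
import pytest
from scrapy import Field, Item

from apify import Actor
from apify.scrapy.pipelines import ActorDatasetPushPipeline


class TitleItem(Item):
    title = Field()


async def test_pipeline_pushes_item(monkeypatch: pytest.MonkeyPatch) -> None:
    dataset: list[dict] = []

    async def mock_push_data(item: dict) -> None:
        dataset.append(item)

    # Replace Actor.push_data so the pipeline can run without the platform.
    monkeypatch.setattr(Actor, 'push_data', mock_push_data)

    item = TitleItem(title='Example Domain')
    output = await ActorDatasetPushPipeline().process_item(item)

    assert output == item
    assert dataset == [{'title': 'Example Domain'}]
```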
diff --git a/uv.lock b/uv.lock
index c20ae954..3e545c4d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -84,7 +84,7 @@ requires-dist = [
     { name = "lazy-object-proxy", specifier = ">=1.11.0" },
     { name = "more-itertools", specifier = ">=10.2.0" },
     { name = "pydantic", specifier = ">=2.11.0" },
-    { name = "scrapy", marker = "extra == 'scrapy'", specifier = ">=2.11.0" },
+    { name = "scrapy", marker = "extra == 'scrapy'", specifier = ">=2.14.0" },
     { name = "typing-extensions", specifier = ">=4.1.0" },
     { name = "websockets", specifier = ">=14.0" },
     { name = "yarl", specifier = ">=1.18.0" },