diff --git a/tests/e2e/actor_source_base/server.py b/tests/e2e/actor_source_base/server.py
index c21ecd9e..fd5d1f38 100644
--- a/tests/e2e/actor_source_base/server.py
+++ b/tests/e2e/actor_source_base/server.py
@@ -1,31 +1,42 @@
+"""Test HTTP server for e2e tests.
+
+Serves an e-commerce test website with a category-based structure for testing crawl depth:
+
+    / (depth 0) - Homepage with links to products, categories, about page, and deep chain
+    /categories/electronics (depth 1) - Links to products 1 and 2
+    /categories/home (depth 1) - Links to product 3
+    /about (depth 1) - About page
+    /deep/1 (depth 1) -> /deep/2 (depth 2) -> /deep/3 (depth 3) -> ... (infinite chain)
+    /products/1 (depth 1 or 2) - Widget A
+    /products/2 (depth 1 or 2) - Widget B
+    /products/3 (depth 1 or 2) - Widget C
+
+The homepage includes both direct product links (for Scrapy spiders that look for /products/ links
+on the start page) and category links (for testing crawl depth with Crawlee crawlers).
+With max_crawl_depth=2, the crawler reaches all products and categories but does not go beyond /deep/2.
 """
-Test server is infinite server http://localhost:8080/{any_number} and each page has links to the next 10 pages.
-For example:
-    http://localhost:8080/ contains links:
-http://localhost:8080/0, http://localhost:8080/1, ..., http://localhost:8080/9
-    http://localhost:8080/1 contains links:
-http://localhost:8080/10, http://localhost:8080/11, ..., http://localhost:8080/19
-
-... and so on.
-"""
+from __future__ import annotations

 import asyncio
 import logging
 from collections.abc import Awaitable, Callable, Coroutine
-from socket import socket
 from typing import Any

 from uvicorn import Config
 from uvicorn.server import Server
-from yarl import URL

 Receive = Callable[[], Awaitable[dict[str, Any]]]
 Send = Callable[[dict[str, Any]], Coroutine[None, None, None]]

+_PRODUCTS = {
+    '1': {'name': 'Widget A', 'price': '$19.99', 'description': 'A basic widget for everyday use'},
+    '2': {'name': 'Widget B', 'price': '$29.99', 'description': 'An advanced widget with extra features'},
+    '3': {'name': 'Widget C', 'price': '$39.99', 'description': 'A premium widget for professionals'},
+}

-async def send_html_response(send: Send, html_content: bytes, status: int = 200) -> None:
-    """Send an HTML response to the client."""
+
+async def _send_html(send: Send, html: str, status: int = 200) -> None:
     await send(
         {
             'type': 'http.response.start',
@@ -33,62 +44,90 @@ async def send_html_response(send: Send, html_content: bytes, status: int = 200
             'headers': [[b'content-type', b'text/html; charset=utf-8']],
         }
     )
-    await send({'type': 'http.response.body', 'body': html_content})
-
+    await send({'type': 'http.response.body', 'body': html.encode()})


-async def app(scope: dict[str, Any], _: Receive, send: Send) -> None:
-    """Main ASGI application handler that routes requests to specific handlers.
-
-    Args:
-        scope: The ASGI connection scope.
-        _: The ASGI receive function.
-        send: The ASGI send function.
-    """
+async def app(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
     assert scope['type'] == 'http'
     path = scope['path']
-    links = '\n'.join(f'<a href="{path}{i}">{path}{i}</a>' for i in range(10))
-    await send_html_response(
-        send,
-        f"""\
-<html>
-<body>
-{links}
-</body>
-</html>""".encode(),
-    )
+
+    if path == '/':
+        await _send_html(
+            send,
+            '<html><body><h1>Test Shop</h1>'
+            '<a href="/products/1">Product 1</a>'
+            '<a href="/products/2">Product 2</a>'
+            '<a href="/products/3">Product 3</a>'
+            '<a href="/categories/electronics">Electronics</a>'
+            '<a href="/categories/home">Home</a>'
+            '<a href="/about">About Us</a>'
+            '<a href="/deep/1">Deep chain</a>'
+            '</body></html>',
+        )
+    elif path == '/categories/electronics':
+        await _send_html(
+            send,
+            '<html><body><h1>Electronics</h1>'
+            '<a href="/products/1">Widget A</a>'
+            '<a href="/products/2">Widget B</a>'
+            '<a href="/">Back to Home</a>'
+            '</body></html>',
+        )
+    elif path == '/categories/home':
+        await _send_html(
+            send,
+            '<html><body><h1>Home</h1>'
+            '<a href="/products/3">Widget C</a>'
+            '<a href="/">Back to Home</a>'
+            '</body></html>',
+        )
+    elif path.startswith('/products/'):
+        product = _PRODUCTS.get(path.split('/')[-1])
+        if product:
+            await _send_html(
+                send,
+                f'<html><body><h1>{product["name"]}</h1>'
+                f'<span class="price">{product["price"]}</span>'
+                f'<p class="description">{product["description"]}</p>'
+                f'<a href="/">Back to Home</a>'
+                f'</body></html>',
+            )
+        else:
+            await _send_html(send, 'Not Found', 404)
+    elif path == '/about':
+        await _send_html(
+            send,
+            '<html><body><h1>About Us</h1><p>We sell the best widgets in the world.</p>'
+            '<a href="/">Back to Home</a>'
+            '</body></html>',
+        )
+    elif path.startswith('/deep/'):
+        try:
+            n = int(path.split('/')[-1])
+        except ValueError:
+            await _send_html(send, 'Not Found', 404)
+            return
+        await _send_html(
+            send,
+            f'<html><body><h1>Deep page {n}</h1>'
+            f'<a href="/deep/{n + 1}">Go deeper</a>'
+            f'</body></html>',
+        )
+    else:
+        await _send_html(send, 'Not Found', 404)


 if __name__ == '__main__':
     asyncio.run(
         Server(
             config=Config(
                 app=app,
                 lifespan='off',
                 loop='asyncio',
                 port=8080,
                 log_config=None,
                 log_level=logging.CRITICAL,
             )
         ).serve()
     )
diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_http_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_http_crawler.py
new file mode 100644
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_http_crawler.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+import re
+
+from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
+
+from apify import Actor
+
+
+async def main() -> None:
+    async with Actor:
+        pages_visited: list[str] = []
+        crawler = HttpCrawler(max_crawl_depth=2)
+
+        @crawler.router.default_handler
+        async def handler(context: HttpCrawlingContext) -> None:
+            pages_visited.append(context.request.url)
+            html = (await context.http_response.read()).decode()
+
+            # HttpCrawler has no HTML parser, so links and product fields
+            # are extracted from the raw body with regular expressions.
+            links = re.findall(r'href="([^"]+)"', html)
+            await context.add_requests([f'http://localhost:8080{link}' for link in links])
+
+            if '/products/' in context.request.url:
+                name_match = re.search(r'<h1>(.*?)</h1>', html)
+                price_match = re.search(r'<span class="price">(.*?)</span>', html)
+                desc_match = re.search(r'<p class="description">(.*?)</p>', html)
+                if name_match:
+                    await context.push_data(
+                        {
+                            'url': context.request.url,
+                            'name': name_match.group(1),
+                            'price': price_match.group(1) if price_match else '',
+                            'description': desc_match.group(1) if desc_match else '',
+                        }
+                    )
+
+        await crawler.run(['http://localhost:8080/'])
+
+        await Actor.set_value(
+            'CRAWLER_RESULT',
+            {
+                'pages_visited_count': len(pages_visited),
+                'crawler_type': 'HttpCrawler',
+            },
+        )
diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_parsel_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_parsel_crawler.py
new file mode 100644
index 00000000..38800dfb
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_parsel_crawler.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
+
+from apify import Actor
+
+
+async def main() -> None:
+    async with Actor:
+        pages_visited: list[str] = []
+        crawler = ParselCrawler(max_crawl_depth=2)
+
+        @crawler.router.default_handler
+        async def handler(context: ParselCrawlingContext) -> None:
+            pages_visited.append(context.request.url)
+            await context.enqueue_links()
+
+            if '/products/' in context.request.url:
+                name = context.selector.css('h1::text').get('').strip()
+                price = context.selector.css('span.price::text').get('').strip()
+                description = context.selector.css('p.description::text').get('').strip()
+                if name:
+                    await context.push_data(
+                        {
+                            'url': context.request.url,
+                            'name': name,
+                            'price': price,
+                            'description': description,
+                        }
+                    )
+
+        await crawler.run(['http://localhost:8080/'])
+
+        await Actor.set_value(
+            'CRAWLER_RESULT',
+            {
+                'pages_visited_count': len(pages_visited),
+                'crawler_type': 'ParselCrawler',
+            },
+        )
diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_playwright_crawler.py
new file mode 100644
index 00000000..650dda7d
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_playwright_crawler.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
+
+from apify import Actor
+
+
+async def main() -> None:
+    async with Actor:
+        pages_visited: list[str] = []
+        crawler = PlaywrightCrawler(max_crawl_depth=2)
+
+        @crawler.router.default_handler
+        async def handler(context: PlaywrightCrawlingContext) -> None:
+            pages_visited.append(context.request.url)
+            await context.enqueue_links()
+
+            if '/products/' in context.request.url:
+                name = await context.page.locator('h1').text_content()
+                price = await context.page.locator('span.price').text_content()
+                description = await context.page.locator('p.description').text_content()
+                if name:
+                    await context.push_data(
+                        {
+                            'url': context.request.url,
+                            'name': name.strip(),
+                            'price': (price or '').strip(),
+                            'description': (description or '').strip(),
+                        }
+                    )
+
+        await crawler.run(['http://localhost:8080/'])
+
+        await Actor.set_value(
+            'CRAWLER_RESULT',
+            {
+                'pages_visited_count': len(pages_visited),
+                'crawler_type': 'PlaywrightCrawler',
+            },
+        )
diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/playwright.Dockerfile b/tests/e2e/test_crawlee_crawlers/actor_source/playwright.Dockerfile
new file mode 100644
index 00000000..99c0e5f7
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/actor_source/playwright.Dockerfile
@@ -0,0 +1,9 @@
+FROM apify/actor-python-playwright:PYTHON_VERSION_PLACEHOLDER
+
+COPY . ./
+
+RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+
+RUN pip install --force-reinstall -r requirements.txt
+
+CMD ["sh", "-c", "python server.py & python -m src"]
diff --git a/tests/e2e/test_crawlee_crawlers/conftest.py b/tests/e2e/test_crawlee_crawlers/conftest.py
new file mode 100644
index 00000000..9965e5cc
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/conftest.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from apify_client.clients.resource_clients import ActorClientAsync
+
+    from apify._models import ActorRun
+
+_PYTHON_VERSION = f'{sys.version_info[0]}.{sys.version_info[1]}'
+
+_ACTOR_SOURCE_DIR = Path(__file__).parent / 'actor_source'
+
+
+def read_actor_source(filename: str) -> str:
+    return (_ACTOR_SOURCE_DIR / filename).read_text()
+
+
+def get_playwright_dockerfile() -> str:
+    return read_actor_source('playwright.Dockerfile').replace(
+        'PYTHON_VERSION_PLACEHOLDER',
+        _PYTHON_VERSION,
+    )
+
+
+_EXPECTED_PRODUCTS = {
+    'Widget A': {'price': '$19.99', 'description': 'A basic widget for everyday use'},
+    'Widget B': {'price': '$29.99', 'description': 'An advanced widget with extra features'},
+    'Widget C': {'price': '$39.99', 'description': 'A premium widget for professionals'},
+}
+
+
+async def verify_crawler_results(
+    actor: ActorClientAsync,
+    run_result: ActorRun,
+    expected_crawler_type: str,
+) -> None:
+    """Verify dataset items and KVS record after a crawler Actor run."""
+    assert run_result.status == 'SUCCEEDED'
+
+    # Verify dataset items.
+    items = await actor.last_run().dataset().list_items()
+    assert items.count == 3
+
+    items_by_name = {item['name']: item for item in items.items}
+
+    for name, expected in _EXPECTED_PRODUCTS.items():
+        assert name in items_by_name, f'Missing product: {name}'
+        item = items_by_name[name]
+        assert 'url' in item
+        assert item['price'] == expected['price']
+        assert item['description'] == expected['description']
+
+    # Verify KVS record.
+    kvs_record = await actor.last_run().key_value_store().get_record('CRAWLER_RESULT')
+    assert kvs_record is not None
+    result = kvs_record['value']
+    assert result['crawler_type'] == expected_crawler_type
+    # With max_crawl_depth=2 the server exposes 9 reachable pages (homepage, 2 categories, about,
+    # /deep/1, 3 products, /deep/2); the bounds stay loose because the adaptive crawler can visit a page twice.
+    assert result['pages_visited_count'] >= 5
+    assert result['pages_visited_count'] <= 15
diff --git a/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py
new file mode 100644
index 00000000..bc84a50a
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py
@@ -0,0 +1,22 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .conftest import get_playwright_dockerfile, read_actor_source, verify_crawler_results
+
+if TYPE_CHECKING:
+    from ..conftest import MakeActorFunction, RunActorFunction
+
+
+async def test_adaptive_playwright_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None:
+    actor = await make_actor(
+        label='crawl-adaptive',
+        source_files={
+            'src/main.py': read_actor_source('main_adaptive_playwright_crawler.py'),
+            'Dockerfile': get_playwright_dockerfile(),
+        },
+        additional_requirements=['crawlee[all]'],
+        memory_mbytes=1024,
+    )
+    run_result = await run_actor(actor)
+    await verify_crawler_results(actor, run_result, 'AdaptivePlaywrightCrawler')
diff --git a/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py b/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py
new file mode 100644
index 00000000..8667f393
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .conftest import read_actor_source, verify_crawler_results
+
+if TYPE_CHECKING:
+    from ..conftest import MakeActorFunction, RunActorFunction
+
+
+async def test_basic_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None:
+    actor = await make_actor(
+        label='crawl-basic',
+        source_files={
+            'src/main.py': read_actor_source('main_basic_crawler.py'),
+        },
+    )
+    run_result = await run_actor(actor)
+    await verify_crawler_results(actor, run_result, 'BasicCrawler')
diff --git a/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py b/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py
new file mode 100644
index 00000000..56d0a497
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .conftest import read_actor_source, verify_crawler_results
+
+if TYPE_CHECKING:
+    from ..conftest import MakeActorFunction, RunActorFunction
+
+
+async def test_beautifulsoup_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None:
+    actor = await make_actor(
+        label='crawl-bsoup',
+        source_files={
+            'src/main.py': read_actor_source('main_beautifulsoup_crawler.py'),
+        },
+        additional_requirements=['crawlee[beautifulsoup]'],
+    )
+    run_result = await run_actor(actor)
+    await verify_crawler_results(actor, run_result, 'BeautifulSoupCrawler')
diff --git a/tests/e2e/test_crawlee_crawlers/test_http_crawler.py b/tests/e2e/test_crawlee_crawlers/test_http_crawler.py
new file mode 100644
index 00000000..86dc0e37
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/test_http_crawler.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .conftest import read_actor_source, verify_crawler_results
+
+if TYPE_CHECKING:
+    from ..conftest import MakeActorFunction, RunActorFunction
+
+
+async def test_http_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None:
+    actor = await make_actor(
+        label='crawl-http',
+        source_files={
+            'src/main.py': read_actor_source('main_http_crawler.py'),
+        },
+    )
+    run_result = await run_actor(actor)
+    await verify_crawler_results(actor, run_result, 'HttpCrawler')
diff --git a/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py b/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py
new file mode 100644
index 00000000..0815c9f2
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .conftest import read_actor_source, verify_crawler_results
+
+if TYPE_CHECKING:
+    from ..conftest import MakeActorFunction, RunActorFunction
+
+
+async def test_parsel_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None:
+    actor = await make_actor(
+        label='crawl-parsel',
+        source_files={
+            'src/main.py': read_actor_source('main_parsel_crawler.py'),
+        },
+    )
+    run_result = await run_actor(actor)
+    await verify_crawler_results(actor, run_result, 'ParselCrawler')
diff --git a/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py
new file mode 100644
index 00000000..5d7d8244
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py
@@ -0,0 +1,22 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .conftest import get_playwright_dockerfile, read_actor_source, verify_crawler_results
+
+if TYPE_CHECKING:
+    from ..conftest import MakeActorFunction, RunActorFunction
+
+
+async def test_playwright_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None:
+    actor = await make_actor(
+        label='crawl-playwright',
+        source_files={
+            'src/main.py': read_actor_source('main_playwright_crawler.py'),
+            'Dockerfile': get_playwright_dockerfile(),
+        },
+        additional_requirements=['crawlee[playwright]'],
+        memory_mbytes=1024,
+    )
+    run_result = await run_actor(actor)
+    await verify_crawler_results(actor, run_result, 'PlaywrightCrawler')
diff --git a/tests/e2e/test_crawlers_with_storages.py b/tests/e2e/test_crawlers_with_storages.py
index cd0d6941..89ff85bb 100644
--- a/tests/e2e/test_crawlers_with_storages.py
+++ b/tests/e2e/test_crawlers_with_storages.py
@@ -23,7 +23,7 @@ async def main() -> None:
     async with Actor:
         crawler = ParselCrawler(max_crawl_depth=2)
         finished = []
-        enqueue_pattern = re.compile(r'http://localhost:8080/2+$')
+        enqueue_pattern = re.compile(r'http://localhost:8080/deep/\d+$')

         @crawler.router.default_handler
         async def default_handler(context: ParselCrawlingContext) -> None:
@@ -33,7 +33,11 @@ async def default_handler(context: ParselCrawlingContext) -> None:
             finished.append(context.request.url)

         await crawler.run(['http://localhost:8080/'])
-        assert finished == ['http://localhost:8080/', 'http://localhost:8080/2', 'http://localhost:8080/22']
+        assert finished == [
+            'http://localhost:8080/',
+            'http://localhost:8080/deep/1',
+            'http://localhost:8080/deep/2',
+        ]

     actor = await make_actor(label='crawler-max-depth', main_func=main)
     run_result = await run_actor(actor)
diff --git a/tests/e2e/test_scrapy/actor_source/server.py b/tests/e2e/test_scrapy/actor_source/server.py
deleted file mode 100644
index 20aff81a..00000000
--- a/tests/e2e/test_scrapy/actor_source/server.py
+++ /dev/null
@@ -1,86 +0,0 @@
-from __future__ import annotations
-
-import asyncio
-import logging
-from collections.abc import Awaitable, Callable, Coroutine
-from typing import Any
-
-from uvicorn import Config
-from uvicorn.server import Server
-
-Receive = Callable[[], Awaitable[dict[str, Any]]]
-Send = Callable[[dict[str, Any]], Coroutine[None, None, None]]
-
-_PRODUCTS = {
-    '1': {'name': 'Widget A', 'price': '$19.99', 'description': 'A basic widget for everyday use'},
-    '2': {'name': 'Widget B', 'price': '$29.99', 'description': 'An advanced widget with extra features'},
-    '3': {'name': 'Widget C', 'price': '$39.99', 'description': 'A premium widget for professionals'},
-}
-
-
-async def _send_html(send: Send, html: str, status: int = 200) -> None:
-    await send(
-        {
-            'type': 'http.response.start',
-            'status': status,
-            'headers': [[b'content-type', b'text/html; charset=utf-8']],
-        }
-    )
-    await send({'type': 'http.response.body', 'body': html.encode()})
-
-
-async def app(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
-    assert scope['type'] == 'http'
-    path = scope['path']
-
-    if path == '/':
-        await _send_html(
-            send,
-            '<html><body><h1>Test Shop</h1>'
-            '<a href="/products/1">Product 1</a>'
-            '<a href="/products/2">Product 2</a>'
-            '<a href="/products/3">Product 3</a>'
-            '<a href="/about">About Us</a>'
-            '</body></html>',
-        )
-    elif path.startswith('/products/'):
-        product = _PRODUCTS.get(path.split('/')[-1])
-        if product:
-            await _send_html(
-                send,
-                f'<html><body><h1>{product["name"]}</h1>'
-                f'<span class="price">{product["price"]}</span>'
-                f'<p class="description">{product["description"]}</p>'
' - f'Back to Home' - f'', - ) - else: - await _send_html(send, 'Not Found', 404) - elif path == '/about': - await _send_html( - send, - 'We sell the best widgets in the world.
' - 'Back to Home' - '', - ) - else: - await _send_html(send, 'Not Found', 404) - - -if __name__ == '__main__': - asyncio.run( - Server( - config=Config( - app=app, - lifespan='off', - loop='asyncio', - port=8080, - log_config=None, - log_level=logging.CRITICAL, - ) - ).serve() - ) diff --git a/tests/e2e/test_scrapy/conftest.py b/tests/e2e/test_scrapy/conftest.py index f5c0cc10..e19f6c36 100644 --- a/tests/e2e/test_scrapy/conftest.py +++ b/tests/e2e/test_scrapy/conftest.py @@ -22,7 +22,6 @@ def get_scrapy_source_files( extra_source_files: dict[str, str] | None = None, ) -> dict[str, str]: source_files: dict[str, str] = { - 'server.py': read_actor_source('server.py'), 'src/__main__.py': read_actor_source('__main__.py'), 'src/main.py': read_actor_source('main.py'), 'src/settings.py': read_actor_source('settings.py'),