diff --git a/tests/e2e/actor_source_base/server.py b/tests/e2e/actor_source_base/server.py
index c21ecd9e..fd5d1f38 100644
--- a/tests/e2e/actor_source_base/server.py
+++ b/tests/e2e/actor_source_base/server.py
@@ -1,31 +1,42 @@
+"""Test HTTP server for e2e tests.
+
+Serves an e-commerce test website with a category-based structure for testing crawl depth:
+
+    / (depth 0) - Homepage with links to products, categories, about page, and deep chain
+    /categories/electronics (depth 1) - Links to products 1 and 2
+    /categories/home (depth 1) - Links to product 3
+    /about (depth 1) - About page
+    /deep/1 (depth 1) -> /deep/2 (depth 2) -> /deep/3 (depth 3) -> ... (infinite chain)
+    /products/1 (depth 1 or 2) - Widget A
+    /products/2 (depth 1 or 2) - Widget B
+    /products/3 (depth 1 or 2) - Widget C
+
+The homepage includes both direct product links (for Scrapy spiders that look for /products/ links
+on the start page) and category links (for testing crawl depth with Crawlee crawlers).
+With max_crawl_depth=2, the crawler reaches all products and categories but does not go beyond /deep/2.
 """
-Test server is infinite server http://localhost:8080/{any_number} and each page has links to the next 10 pages.
-For example:
-    http://localhost:8080/ contains links:
-http://localhost:8080/0, http://localhost:8080/1, ..., http://localhost:8080/9
-    http://localhost:8080/1 contains links:
-http://localhost:8080/10, http://localhost:8080/11, ..., http://localhost:8080/19
-
-... and so on.
-"""
+from __future__ import annotations
 
 import asyncio
 import logging
 from collections.abc import Awaitable, Callable, Coroutine
-from socket import socket
 from typing import Any
 
 from uvicorn import Config
 from uvicorn.server import Server
-from yarl import URL
 
 Receive = Callable[[], Awaitable[dict[str, Any]]]
 Send = Callable[[dict[str, Any]], Coroutine[None, None, None]]
 
+_PRODUCTS = {
+    '1': {'name': 'Widget A', 'price': '$19.99', 'description': 'A basic widget for everyday use'},
+    '2': {'name': 'Widget B', 'price': '$29.99', 'description': 'An advanced widget with extra features'},
+    '3': {'name': 'Widget C', 'price': '$39.99', 'description': 'A premium widget for professionals'},
+}
 
 
-async def send_html_response(send: Send, html_content: bytes, status: int = 200) -> None:
-    """Send an HTML response to the client."""
+
+async def _send_html(send: Send, html: str, status: int = 200) -> None:
     await send(
         {
             'type': 'http.response.start',
@@ -33,62 +44,90 @@ async def send_html_response(send: Send, html_content: bytes, status: int = 200)
             'headers': [[b'content-type', b'text/html; charset=utf-8']],
         }
     )
-    await send({'type': 'http.response.body', 'body': html_content})
-
+    await send({'type': 'http.response.body', 'body': html.encode()})
 
 
-async def app(scope: dict[str, Any], _: Receive, send: Send) -> None:
-    """Main ASGI application handler that routes requests to specific handlers.
-    Args:
-        scope: The ASGI connection scope.
-        _: The ASGI receive function.
-        send: The ASGI send function.
-    """
+async def app(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
     assert scope['type'] == 'http'
     path = scope['path']
 
-    links = '\n'.join(f'<a href="{path}{i}">{path}{i}</a>' for i in range(10))
-    await send_html_response(
-        send,
-        f"""\
-        <html>
-            <head><title>Title for {path}</title>
-            </head>
-            <body>
-                {links}
-""".encode(),
-    )
-
-
-class TestServer(Server):
-    """A test HTTP server implementation based on Uvicorn Server."""
-
-    @property
-    def url(self) -> URL:
-        """Get the base URL of the server.
-
-        Returns:
-            A URL instance with the server's base URL.
- """ - protocol = 'https' if self.config.is_ssl else 'http' - return URL(f'{protocol}://{self.config.host}:{self.config.port}/') - - async def serve(self, sockets: list[socket] | None = None) -> None: - """Run the server.""" - if sockets: - raise RuntimeError('Simple TestServer does not support custom sockets') - self.restart_requested = asyncio.Event() - - loop = asyncio.get_event_loop() - tasks = { - loop.create_task(super().serve()), - } - await asyncio.wait(tasks) + if path == '/': + await _send_html( + send, + 'E-commerce Test Store' + '

Welcome to Test Store

' + 'Widget A' + 'Widget B' + 'Widget C' + 'Electronics' + 'Home & Garden' + 'About Us' + 'Explore More' + '', + ) + elif path == '/categories/electronics': + await _send_html( + send, + 'Electronics' + '

Electronics

' + 'Widget A' + 'Widget B' + 'Back to Home' + '', + ) + elif path == '/categories/home': + await _send_html( + send, + 'Home & Garden' + '

Home & Garden

' + 'Widget C' + 'Back to Home' + '', + ) + elif path.startswith('/products/'): + product = _PRODUCTS.get(path.split('/')[-1]) + if product: + await _send_html( + send, + f'{product["name"]}' + f'

{product["name"]}

' + f'{product["price"]}' + f'

{product["description"]}

' + f'Back to Home' + f'', + ) + else: + await _send_html(send, 'Not Found', 404) + elif path == '/about': + await _send_html( + send, + 'About Us' + '

About Test Store

' + '

We sell the best widgets in the world.

' + 'Back to Home' + '', + ) + elif path.startswith('/deep/'): + try: + n = int(path.split('/')[-1]) + except ValueError: + await _send_html(send, 'Not Found', 404) + return + await _send_html( + send, + f'Deep Page {n}' + f'

Deep Page {n}

' + f'Go Deeper' + f'Back to Home' + f'', + ) + else: + await _send_html(send, 'Not Found', 404) if __name__ == '__main__': asyncio.run( - TestServer( + Server( config=Config( app=app, lifespan='off', diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index cb894087..4ae56fc6 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -168,12 +168,9 @@ def actor_base_source_files(sdk_wheel_path: Path) -> dict[str, str | bytes]: 'APIFY_SDK_WHEEL_PLACEHOLDER', f'./{sdk_wheel_file_name}' ) - current_major_minor_python_version = '.'.join([str(x) for x in sys.version_info[:2]]) - integration_tests_python_version = ( - os.getenv('INTEGRATION_TESTS_PYTHON_VERSION') or current_major_minor_python_version - ) + python_version = f'{sys.version_info[0]}.{sys.version_info[1]}' source_files['Dockerfile'] = str(source_files['Dockerfile']).replace( - 'BASE_IMAGE_VERSION_PLACEHOLDER', integration_tests_python_version + 'BASE_IMAGE_VERSION_PLACEHOLDER', python_version ) return source_files @@ -190,6 +187,7 @@ def __call__( main_py: str | None = None, source_files: Mapping[str, str | bytes] | None = None, additional_requirements: list[str] | None = None, + memory_mbytes: int = 256, ) -> Awaitable[ActorClientAsync]: """Create a temporary Actor from the given main function or source files. @@ -204,6 +202,7 @@ def __call__( main_py: The `src/main.py` file of the Actor. source_files: A dictionary of the source files of the Actor. additional_requirements: A list of additional requirements to be added to the `requirements.txt`. + memory_mbytes: The default memory allocation for the Actor run in MB. Returns: A resource client for the created Actor. @@ -229,6 +228,7 @@ async def _make_actor( main_py: str | None = None, source_files: Mapping[str, str | bytes] | None = None, additional_requirements: list[str] | None = None, + memory_mbytes: int = 256, ) -> ActorClientAsync: if not (main_func or main_py or source_files): raise TypeError('One of `main_func`, `main_py` or `source_files` arguments must be specified') @@ -298,7 +298,7 @@ async def _make_actor( created_actor = await client.actors().create( name=actor_name, default_run_build='latest', - default_run_memory_mbytes=256, + default_run_memory_mbytes=memory_mbytes, default_run_timeout_secs=600, versions=[ { diff --git a/tests/e2e/test_crawlee_crawlers/__init__.py b/tests/e2e/test_crawlee_crawlers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_adaptive_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_adaptive_playwright_crawler.py new file mode 100644 index 00000000..55e0ec8f --- /dev/null +++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_adaptive_playwright_crawler.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext + +from apify import Actor + + +async def main() -> None: + async with Actor: + pages_visited: list[str] = [] + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(max_crawl_depth=2) + + @crawler.router.default_handler + async def handler(context: AdaptivePlaywrightCrawlingContext) -> None: + pages_visited.append(context.request.url) + await context.enqueue_links() + + if '/products/' in context.request.url: + name = context.parsed_content.css('h1::text').get('').strip() + price = context.parsed_content.css('span.price::text').get('').strip() + description = context.parsed_content.css('p.description::text').get('').strip() + if 
+                    await context.push_data(
+                        {
+                            'url': context.request.url,
+                            'name': name,
+                            'price': price,
+                            'description': description,
+                        }
+                    )
+
+        await crawler.run(['http://localhost:8080/'])
+
+        await Actor.set_value(
+            'CRAWLER_RESULT',
+            {
+                'pages_visited_count': len(pages_visited),
+                'crawler_type': 'AdaptivePlaywrightCrawler',
+            },
+        )
diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_basic_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_basic_crawler.py
new file mode 100644
index 00000000..98ec114b
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_basic_crawler.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+from html.parser import HTMLParser
+from typing import TYPE_CHECKING
+
+from crawlee.crawlers import BasicCrawler
+
+from apify import Actor
+
+if TYPE_CHECKING:
+    from crawlee._types import BasicCrawlingContext
+
+
+class _PageParser(HTMLParser):
+    def __init__(self) -> None:
+        super().__init__()
+        self.links: list[str] = []
+        self.data: dict[str, str] = {}
+        self._in_tag: str | None = None
+        self._in_class: str = ''
+
+    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        attrs_dict = dict(attrs)
+        if tag == 'a' and (href := attrs_dict.get('href')):
+            self.links.append(href)
+        self._in_tag = tag
+        self._in_class = attrs_dict.get('class', '') or ''
+
+    def handle_endtag(self, tag: str) -> None:  # noqa: ARG002
+        self._in_tag = None
+        self._in_class = ''
+
+    def handle_data(self, data: str) -> None:
+        text = data.strip()
+        if not text:
+            return
+        if self._in_tag == 'h1':
+            self.data['name'] = text
+        elif self._in_tag == 'span' and self._in_class == 'price':
+            self.data['price'] = text
+        elif self._in_tag == 'p' and self._in_class == 'description':
+            self.data['description'] = text
+
+
+async def main() -> None:
+    async with Actor:
+        pages_visited: list[str] = []
+        crawler = BasicCrawler(max_crawl_depth=2)
+
+        @crawler.router.default_handler
+        async def handler(context: BasicCrawlingContext) -> None:
+            pages_visited.append(context.request.url)
+
+            response = await context.send_request(context.request.url)
+            html = (await response.read()).decode()
+
+            parser = _PageParser()
+            parser.feed(html)
+
+            base_url = 'http://localhost:8080'
+            await context.add_requests([f'{base_url}{link}' for link in parser.links if link.startswith('/')])
+
+            if '/products/' in context.request.url and parser.data.get('name'):
+                await context.push_data({'url': context.request.url, **parser.data})
+
+        await crawler.run(['http://localhost:8080/'])
+
+        await Actor.set_value(
+            'CRAWLER_RESULT',
+            {
+                'pages_visited_count': len(pages_visited),
+                'crawler_type': 'BasicCrawler',
+            },
+        )
diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_beautifulsoup_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_beautifulsoup_crawler.py
new file mode 100644
index 00000000..09170d7b
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_beautifulsoup_crawler.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+
+from apify import Actor
+
+
+async def main() -> None:
+    async with Actor:
+        pages_visited: list[str] = []
+        crawler = BeautifulSoupCrawler(max_crawl_depth=2)
+
+        @crawler.router.default_handler
+        async def handler(context: BeautifulSoupCrawlingContext) -> None:
+            pages_visited.append(context.request.url)
+            await context.enqueue_links()
+
+            if '/products/' in context.request.url:
+                name_tag = context.soup.find('h1')
+                price_tag = context.soup.find('span', class_='price')
+                desc_tag = context.soup.find('p', class_='description')
+                if name_tag:
+                    await context.push_data(
+                        {
+                            'url': context.request.url,
+                            'name': name_tag.get_text(strip=True),
+                            'price': price_tag.get_text(strip=True) if price_tag else '',
+                            'description': desc_tag.get_text(strip=True) if desc_tag else '',
+                        }
+                    )
+
+        await crawler.run(['http://localhost:8080/'])
+
+        await Actor.set_value(
+            'CRAWLER_RESULT',
+            {
+                'pages_visited_count': len(pages_visited),
+                'crawler_type': 'BeautifulSoupCrawler',
+            },
+        )
diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_http_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_http_crawler.py
new file mode 100644
index 00000000..c9c2bc5a
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_http_crawler.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+import re
+
+from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
+
+from apify import Actor
+
+
+async def main() -> None:
+    async with Actor:
+        pages_visited: list[str] = []
+        crawler = HttpCrawler(max_crawl_depth=2)
+
+        @crawler.router.default_handler
+        async def handler(context: HttpCrawlingContext) -> None:
+            pages_visited.append(context.request.url)
+            html = (await context.http_response.read()).decode()
+
+            links = re.findall(r'href="(/[^"]*)"', html)
+            base_url = 'http://localhost:8080'
+            await context.add_requests([f'{base_url}{link}' for link in links])
+
+            if '/products/' in context.request.url:
+                name_match = re.search(r'<h1>(.*?)</h1>', html)
+                price_match = re.search(r'<span class="price">(.*?)</span>', html)
+                desc_match = re.search(r'<p class="description">(.*?)</p>', html)
+                if name_match:
+                    await context.push_data(
+                        {
+                            'url': context.request.url,
+                            'name': name_match.group(1),
+                            'price': price_match.group(1) if price_match else '',
+                            'description': desc_match.group(1) if desc_match else '',
+                        }
+                    )
+
+        await crawler.run(['http://localhost:8080/'])
+
+        await Actor.set_value(
+            'CRAWLER_RESULT',
+            {
+                'pages_visited_count': len(pages_visited),
+                'crawler_type': 'HttpCrawler',
+            },
+        )
diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_parsel_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_parsel_crawler.py
new file mode 100644
index 00000000..38800dfb
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_parsel_crawler.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
+
+from apify import Actor
+
+
+async def main() -> None:
+    async with Actor:
+        pages_visited: list[str] = []
+        crawler = ParselCrawler(max_crawl_depth=2)
+
+        @crawler.router.default_handler
+        async def handler(context: ParselCrawlingContext) -> None:
+            pages_visited.append(context.request.url)
+            await context.enqueue_links()
+
+            if '/products/' in context.request.url:
+                name = context.selector.css('h1::text').get('').strip()
+                price = context.selector.css('span.price::text').get('').strip()
+                description = context.selector.css('p.description::text').get('').strip()
+                if name:
+                    await context.push_data(
+                        {
+                            'url': context.request.url,
+                            'name': name,
+                            'price': price,
+                            'description': description,
+                        }
+                    )
+
+        await crawler.run(['http://localhost:8080/'])
+
+        await Actor.set_value(
+            'CRAWLER_RESULT',
+            {
+                'pages_visited_count': len(pages_visited),
+                'crawler_type': 'ParselCrawler',
+            },
+        )
diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_playwright_crawler.py
new file mode 100644
index 00000000..650dda7d
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_playwright_crawler.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
+
+from apify import Actor
+
+
+async def main() -> None:
+    async with Actor:
+        pages_visited: list[str] = []
+        crawler = PlaywrightCrawler(max_crawl_depth=2)
+
+        @crawler.router.default_handler
+        async def handler(context: PlaywrightCrawlingContext) -> None:
+            pages_visited.append(context.request.url)
+            await context.enqueue_links()
+
+            if '/products/' in context.request.url:
+                name = await context.page.locator('h1').text_content()
+                price = await context.page.locator('span.price').text_content()
+                description = await context.page.locator('p.description').text_content()
+                if name:
+                    await context.push_data(
+                        {
+                            'url': context.request.url,
+                            'name': name.strip(),
+                            'price': (price or '').strip(),
+                            'description': (description or '').strip(),
+                        }
+                    )
+
+        await crawler.run(['http://localhost:8080/'])
+
+        await Actor.set_value(
+            'CRAWLER_RESULT',
+            {
+                'pages_visited_count': len(pages_visited),
+                'crawler_type': 'PlaywrightCrawler',
+            },
+        )
diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/playwright.Dockerfile b/tests/e2e/test_crawlee_crawlers/actor_source/playwright.Dockerfile
new file mode 100644
index 00000000..99c0e5f7
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/actor_source/playwright.Dockerfile
@@ -0,0 +1,9 @@
+FROM apify/actor-python-playwright:PYTHON_VERSION_PLACEHOLDER
+
+COPY . ./
+
+RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+
+RUN pip install --force-reinstall -r requirements.txt
+
+CMD ["sh", "-c", "python server.py & python -m src"]
diff --git a/tests/e2e/test_crawlee_crawlers/conftest.py b/tests/e2e/test_crawlee_crawlers/conftest.py
new file mode 100644
index 00000000..9965e5cc
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/conftest.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from apify_client.clients.resource_clients import ActorClientAsync
+
+    from apify._models import ActorRun
+
+_PYTHON_VERSION = f'{sys.version_info[0]}.{sys.version_info[1]}'
+
+_ACTOR_SOURCE_DIR = Path(__file__).parent / 'actor_source'
+
+
+def read_actor_source(filename: str) -> str:
+    return (_ACTOR_SOURCE_DIR / filename).read_text()
+
+
+def get_playwright_dockerfile() -> str:
+    return read_actor_source('playwright.Dockerfile').replace(
+        'PYTHON_VERSION_PLACEHOLDER',
+        _PYTHON_VERSION,
+    )
+
+
+_EXPECTED_PRODUCTS = {
+    'Widget A': {'price': '$19.99', 'description': 'A basic widget for everyday use'},
+    'Widget B': {'price': '$29.99', 'description': 'An advanced widget with extra features'},
+    'Widget C': {'price': '$39.99', 'description': 'A premium widget for professionals'},
+}
+
+
+async def verify_crawler_results(
+    actor: ActorClientAsync,
+    run_result: ActorRun,
+    expected_crawler_type: str,
+) -> None:
+    """Verify dataset items and KVS record after a crawler Actor run."""
+    assert run_result.status == 'SUCCEEDED'
+
+    # Verify dataset items.
+    items = await actor.last_run().dataset().list_items()
+    assert items.count == 3
+
+    items_by_name = {item['name']: item for item in items.items}
+
+    for name, expected in _EXPECTED_PRODUCTS.items():
+        assert name in items_by_name, f'Missing product: {name}'
+        item = items_by_name[name]
+        assert 'url' in item
+        assert item['price'] == expected['price']
+        assert item['description'] == expected['description']
+
+    # Verify KVS record.
+    kvs_record = await actor.last_run().key_value_store().get_record('CRAWLER_RESULT')
+    assert kvs_record is not None
+    result = kvs_record['value']
+    assert result['crawler_type'] == expected_crawler_type
+    # With max_crawl_depth=2, the server has 9 pages reachable (homepage, 2 categories, about, /deep/1,
+    # 3 products, /deep/2). The crawler should visit most of them but not go beyond /deep/2.
+    assert result['pages_visited_count'] >= 5
+    assert result['pages_visited_count'] <= 15
diff --git a/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py
new file mode 100644
index 00000000..bc84a50a
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py
@@ -0,0 +1,22 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .conftest import get_playwright_dockerfile, read_actor_source, verify_crawler_results
+
+if TYPE_CHECKING:
+    from ..conftest import MakeActorFunction, RunActorFunction
+
+
+async def test_adaptive_playwright_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None:
+    actor = await make_actor(
+        label='crawl-adaptive',
+        source_files={
+            'src/main.py': read_actor_source('main_adaptive_playwright_crawler.py'),
+            'Dockerfile': get_playwright_dockerfile(),
+        },
+        additional_requirements=['crawlee[all]'],
+        memory_mbytes=1024,
+    )
+    run_result = await run_actor(actor)
+    await verify_crawler_results(actor, run_result, 'AdaptivePlaywrightCrawler')
diff --git a/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py b/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py
new file mode 100644
index 00000000..8667f393
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .conftest import read_actor_source, verify_crawler_results
+
+if TYPE_CHECKING:
+    from ..conftest import MakeActorFunction, RunActorFunction
+
+
+async def test_basic_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None:
+    actor = await make_actor(
+        label='crawl-basic',
+        source_files={
+            'src/main.py': read_actor_source('main_basic_crawler.py'),
+        },
+    )
+    run_result = await run_actor(actor)
+    await verify_crawler_results(actor, run_result, 'BasicCrawler')
diff --git a/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py b/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py
new file mode 100644
index 00000000..56d0a497
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .conftest import read_actor_source, verify_crawler_results
+
+if TYPE_CHECKING:
+    from ..conftest import MakeActorFunction, RunActorFunction
+
+
+async def test_beautifulsoup_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None:
+    actor = await make_actor(
+        label='crawl-bsoup',
+        source_files={
+            'src/main.py': read_actor_source('main_beautifulsoup_crawler.py'),
+        },
+        additional_requirements=['crawlee[beautifulsoup]'],
+    )
+    run_result = await run_actor(actor)
+    await verify_crawler_results(actor, run_result, 'BeautifulSoupCrawler')
diff --git a/tests/e2e/test_crawlee_crawlers/test_http_crawler.py b/tests/e2e/test_crawlee_crawlers/test_http_crawler.py
new file mode 100644
index 00000000..86dc0e37
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/test_http_crawler.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .conftest import read_actor_source, verify_crawler_results
+
+if TYPE_CHECKING:
+    from ..conftest import MakeActorFunction, RunActorFunction
+
+
+async def test_http_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None:
+    actor = await make_actor(
+        label='crawl-http',
+        source_files={
+            'src/main.py': read_actor_source('main_http_crawler.py'),
+        },
+    )
+    run_result = await run_actor(actor)
+    await verify_crawler_results(actor, run_result, 'HttpCrawler')
diff --git a/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py b/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py
new file mode 100644
index 00000000..0815c9f2
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .conftest import read_actor_source, verify_crawler_results
+
+if TYPE_CHECKING:
+    from ..conftest import MakeActorFunction, RunActorFunction
+
+
+async def test_parsel_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None:
+    actor = await make_actor(
+        label='crawl-parsel',
+        source_files={
+            'src/main.py': read_actor_source('main_parsel_crawler.py'),
+        },
+    )
+    run_result = await run_actor(actor)
+    await verify_crawler_results(actor, run_result, 'ParselCrawler')
diff --git a/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py
new file mode 100644
index 00000000..5d7d8244
--- /dev/null
+++ b/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py
@@ -0,0 +1,22 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .conftest import get_playwright_dockerfile, read_actor_source, verify_crawler_results
+
+if TYPE_CHECKING:
+    from ..conftest import MakeActorFunction, RunActorFunction
+
+
+async def test_playwright_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None:
+    actor = await make_actor(
+        label='crawl-playwright',
+        source_files={
+            'src/main.py': read_actor_source('main_playwright_crawler.py'),
+            'Dockerfile': get_playwright_dockerfile(),
+        },
+        additional_requirements=['crawlee[playwright]'],
+        memory_mbytes=1024,
+    )
+    run_result = await run_actor(actor)
+    await verify_crawler_results(actor, run_result, 'PlaywrightCrawler')
diff --git a/tests/e2e/test_crawlers_with_storages.py b/tests/e2e/test_crawlers_with_storages.py
index cd0d6941..89ff85bb 100644
--- a/tests/e2e/test_crawlers_with_storages.py
+++ b/tests/e2e/test_crawlers_with_storages.py
@@ -23,7 +23,7 @@ async def main() -> None:
         async with Actor:
             crawler = ParselCrawler(max_crawl_depth=2)
             finished = []
-            enqueue_pattern = re.compile(r'http://localhost:8080/2+$')
+            enqueue_pattern = re.compile(r'http://localhost:8080/deep/\d+$')
 
             @crawler.router.default_handler
             async def default_handler(context: ParselCrawlingContext) -> None:
@@ -33,7 +33,11 @@ async def default_handler(context: ParselCrawlingContext) -> None:
                 finished.append(context.request.url)
 
             await crawler.run(['http://localhost:8080/'])
-            assert finished == ['http://localhost:8080/', 'http://localhost:8080/2', 'http://localhost:8080/22']
+            assert finished == [
+                'http://localhost:8080/',
+                'http://localhost:8080/deep/1',
+                'http://localhost:8080/deep/2',
+            ]
 
     actor = await make_actor(label='crawler-max-depth', main_func=main)
     run_result = await run_actor(actor)
diff --git a/tests/e2e/test_scrapy/actor_source/server.py b/tests/e2e/test_scrapy/actor_source/server.py
deleted file mode 100644
index 20aff81a..00000000
--- a/tests/e2e/test_scrapy/actor_source/server.py
+++ /dev/null
@@ -1,86 +0,0 @@
-from __future__ import annotations
-
-import asyncio
-import logging
-from collections.abc import Awaitable, Callable, Coroutine
-from typing import Any
-
-from uvicorn import Config
-from uvicorn.server import Server
-
-Receive = Callable[[], Awaitable[dict[str, Any]]]
-Send = Callable[[dict[str, Any]], Coroutine[None, None, None]]
-
-_PRODUCTS = {
-    '1': {'name': 'Widget A', 'price': '$19.99', 'description': 'A basic widget for everyday use'},
-    '2': {'name': 'Widget B', 'price': '$29.99', 'description': 'An advanced widget with extra features'},
-    '3': {'name': 'Widget C', 'price': '$39.99', 'description': 'A premium widget for professionals'},
-}
-
-
-async def _send_html(send: Send, html: str, status: int = 200) -> None:
-    await send(
-        {
-            'type': 'http.response.start',
-            'status': status,
-            'headers': [[b'content-type', b'text/html; charset=utf-8']],
-        }
-    )
-    await send({'type': 'http.response.body', 'body': html.encode()})
-
-
-async def app(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
-    assert scope['type'] == 'http'
-    path = scope['path']
-
-    if path == '/':
-        await _send_html(
-            send,
-            '<html><head><title>E-commerce Test Store</title></head>'
-            '<body><h1>Welcome to Test Store</h1>'
-            '<a href="/products/1">Widget A</a>'
-            '<a href="/products/2">Widget B</a>'
-            '<a href="/products/3">Widget C</a>'
-            '<a href="/about">About Us</a>'
-            '</body></html>',
-        )
-    elif path.startswith('/products/'):
-        product = _PRODUCTS.get(path.split('/')[-1])
-        if product:
-            await _send_html(
-                send,
-                f'<html><head><title>{product["name"]}</title></head>'
-                f'<body><h1>{product["name"]}</h1>'
-                f'<span class="price">{product["price"]}</span>'
-                f'<p class="description">{product["description"]}</p>'
-                f'<a href="/">Back to Home</a>'
-                f'</body></html>',
-            )
-        else:
-            await _send_html(send, 'Not Found', 404)
-    elif path == '/about':
-        await _send_html(
-            send,
-            '<html><head><title>About Us</title></head>'
-            '<body><h1>About Test Store</h1>'
-            '<p>We sell the best widgets in the world.</p>'
-            '<a href="/">Back to Home</a>'
-            '</body></html>',
-        )
-    else:
-        await _send_html(send, 'Not Found', 404)
-
-
-if __name__ == '__main__':
-    asyncio.run(
-        Server(
-            config=Config(
-                app=app,
-                lifespan='off',
-                loop='asyncio',
-                port=8080,
-                log_config=None,
-                log_level=logging.CRITICAL,
-            )
-        ).serve()
-    )
diff --git a/tests/e2e/test_scrapy/conftest.py b/tests/e2e/test_scrapy/conftest.py
index f5c0cc10..e19f6c36 100644
--- a/tests/e2e/test_scrapy/conftest.py
+++ b/tests/e2e/test_scrapy/conftest.py
@@ -22,7 +22,6 @@ def get_scrapy_source_files(
     extra_source_files: dict[str, str] | None = None,
 ) -> dict[str, str]:
     source_files: dict[str, str] = {
-        'server.py': read_actor_source('server.py'),
         'src/__main__.py': read_actor_source('__main__.py'),
         'src/main.py': read_actor_source('main.py'),
         'src/settings.py': read_actor_source('settings.py'),