12 changes: 6 additions & 6 deletions tests/e2e/conftest.py
@@ -168,12 +168,9 @@ def actor_base_source_files(sdk_wheel_path: Path) -> dict[str, str | bytes]:
         'APIFY_SDK_WHEEL_PLACEHOLDER', f'./{sdk_wheel_file_name}'
     )

-    current_major_minor_python_version = '.'.join([str(x) for x in sys.version_info[:2]])
-    integration_tests_python_version = (
-        os.getenv('INTEGRATION_TESTS_PYTHON_VERSION') or current_major_minor_python_version
-    )
+    python_version = f'{sys.version_info[0]}.{sys.version_info[1]}'
     source_files['Dockerfile'] = str(source_files['Dockerfile']).replace(
-        'BASE_IMAGE_VERSION_PLACEHOLDER', integration_tests_python_version
+        'BASE_IMAGE_VERSION_PLACEHOLDER', python_version
     )

     return source_files
@@ -190,6 +187,7 @@ def __call__(
         main_py: str | None = None,
         source_files: Mapping[str, str | bytes] | None = None,
         additional_requirements: list[str] | None = None,
+        memory_mbytes: int = 256,
     ) -> Awaitable[ActorClientAsync]:
         """Create a temporary Actor from the given main function or source files.

@@ -204,6 +202,7 @@ def __call__(
             main_py: The `src/main.py` file of the Actor.
             source_files: A dictionary of the source files of the Actor.
             additional_requirements: A list of additional requirements to be added to the `requirements.txt`.
+            memory_mbytes: The default memory allocation for the Actor run in MB.

         Returns:
             A resource client for the created Actor.
@@ -229,6 +228,7 @@ async def _make_actor(
         main_py: str | None = None,
         source_files: Mapping[str, str | bytes] | None = None,
         additional_requirements: list[str] | None = None,
+        memory_mbytes: int = 256,
     ) -> ActorClientAsync:
         if not (main_func or main_py or source_files):
             raise TypeError('One of `main_func`, `main_py` or `source_files` arguments must be specified')
@@ -298,7 +298,7 @@ async def _make_actor(
         created_actor = await client.actors().create(
             name=actor_name,
             default_run_build='latest',
-            default_run_memory_mbytes=256,
+            default_run_memory_mbytes=memory_mbytes,
             default_run_timeout_secs=600,
             versions=[
                 {
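
A minimal usage sketch of the extended fixture, assuming an async pytest test that has `make_actor` injected; the Actor label, the source-file dict, the extras, and the 2048 MB value are illustrative and not part of this diff:

# Hypothetical call site: request extra memory for a Playwright-based test Actor
# via the new `memory_mbytes` parameter (which replaces the hard-coded 256 MB).
actor = await make_actor(
    'adaptive-playwright',
    source_files=crawler_source_files,  # assumed dict with Dockerfile, server.py, src/main.py, ...
    additional_requirements=['crawlee[parsel,playwright]'],
    memory_mbytes=2048,
)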
Empty file.
@@ -0,0 +1,40 @@
from __future__ import annotations

from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext

from apify import Actor


async def main() -> None:
async with Actor:
pages_visited: list[str] = []
crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser()

@crawler.router.default_handler
async def handler(context: AdaptivePlaywrightCrawlingContext) -> None:
pages_visited.append(context.request.url)
await context.enqueue_links()

if '/products/' in context.request.url:
name = context.parsed_content.css('h1::text').get('').strip()
price = context.parsed_content.css('span.price::text').get('').strip()
description = context.parsed_content.css('p.description::text').get('').strip()
if name:
await context.push_data(
{
'url': context.request.url,
'name': name,
'price': price,
'description': description,
}
)

await crawler.run(['http://localhost:8080/'])

await Actor.set_value(
'CRAWLER_RESULT',
{
'pages_visited_count': len(pages_visited),
'crawler_type': 'AdaptivePlaywrightCrawler',
},
)
74 changes: 74 additions & 0 deletions tests/e2e/test_crawlee_crawlers/actor_source/main_basic_crawler.py
@@ -0,0 +1,74 @@
from __future__ import annotations

from html.parser import HTMLParser
from typing import TYPE_CHECKING

from crawlee.crawlers import BasicCrawler

from apify import Actor

if TYPE_CHECKING:
from crawlee._types import BasicCrawlingContext


class _PageParser(HTMLParser):
def __init__(self) -> None:
super().__init__()
self.links: list[str] = []
self.data: dict[str, str] = {}
self._in_tag: str | None = None
self._in_class: str = ''

def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
attrs_dict = dict(attrs)
if tag == 'a' and (href := attrs_dict.get('href')):
self.links.append(href)
self._in_tag = tag
self._in_class = attrs_dict.get('class', '') or ''

def handle_endtag(self, tag: str) -> None: # noqa: ARG002
self._in_tag = None
self._in_class = ''

def handle_data(self, data: str) -> None:
text = data.strip()
if not text:
return
if self._in_tag == 'h1':
self.data['name'] = text
elif self._in_tag == 'span' and self._in_class == 'price':
self.data['price'] = text
elif self._in_tag == 'p' and self._in_class == 'description':
self.data['description'] = text


async def main() -> None:
async with Actor:
pages_visited: list[str] = []
crawler = BasicCrawler()

@crawler.router.default_handler
async def handler(context: BasicCrawlingContext) -> None:
pages_visited.append(context.request.url)

response = await context.send_request(context.request.url)
html = (await response.read()).decode()

parser = _PageParser()
parser.feed(html)

base_url = 'http://localhost:8080'
await context.add_requests([f'{base_url}{link}' for link in parser.links if link.startswith('/')])

if '/products/' in context.request.url and parser.data.get('name'):
await context.push_data({'url': context.request.url, **parser.data})

await crawler.run(['http://localhost:8080/'])

await Actor.set_value(
'CRAWLER_RESULT',
{
'pages_visited_count': len(pages_visited),
'crawler_type': 'BasicCrawler',
},
)
@@ -0,0 +1,40 @@
from __future__ import annotations

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

from apify import Actor


async def main() -> None:
async with Actor:
pages_visited: list[str] = []
crawler = BeautifulSoupCrawler()

@crawler.router.default_handler
async def handler(context: BeautifulSoupCrawlingContext) -> None:
pages_visited.append(context.request.url)
await context.enqueue_links()

if '/products/' in context.request.url:
name_tag = context.soup.find('h1')
price_tag = context.soup.find('span', class_='price')
desc_tag = context.soup.find('p', class_='description')
if name_tag:
await context.push_data(
{
'url': context.request.url,
'name': name_tag.get_text(strip=True),
'price': price_tag.get_text(strip=True) if price_tag else '',
'description': desc_tag.get_text(strip=True) if desc_tag else '',
}
)

await crawler.run(['http://localhost:8080/'])

await Actor.set_value(
'CRAWLER_RESULT',
{
'pages_visited_count': len(pages_visited),
'crawler_type': 'BeautifulSoupCrawler',
},
)
46 changes: 46 additions & 0 deletions tests/e2e/test_crawlee_crawlers/actor_source/main_http_crawler.py
@@ -0,0 +1,46 @@
from __future__ import annotations

import re

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

from apify import Actor


async def main() -> None:
async with Actor:
pages_visited: list[str] = []
crawler = HttpCrawler()

@crawler.router.default_handler
async def handler(context: HttpCrawlingContext) -> None:
pages_visited.append(context.request.url)
html = (await context.http_response.read()).decode()

links = re.findall(r'href="(/[^"]*)"', html)
base_url = 'http://localhost:8080'
await context.add_requests([f'{base_url}{link}' for link in links])

if '/products/' in context.request.url:
name_match = re.search(r'<h1>(.*?)</h1>', html)
price_match = re.search(r'<span class="price">(.*?)</span>', html)
desc_match = re.search(r'<p class="description">(.*?)</p>', html)
if name_match:
await context.push_data(
{
'url': context.request.url,
'name': name_match.group(1),
'price': price_match.group(1) if price_match else '',
'description': desc_match.group(1) if desc_match else '',
}
)

await crawler.run(['http://localhost:8080/'])

await Actor.set_value(
'CRAWLER_RESULT',
{
'pages_visited_count': len(pages_visited),
'crawler_type': 'HttpCrawler',
},
)
@@ -0,0 +1,40 @@
from __future__ import annotations

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

from apify import Actor


async def main() -> None:
async with Actor:
pages_visited: list[str] = []
crawler = ParselCrawler()

@crawler.router.default_handler
async def handler(context: ParselCrawlingContext) -> None:
pages_visited.append(context.request.url)
await context.enqueue_links()

if '/products/' in context.request.url:
name = context.selector.css('h1::text').get('').strip()
price = context.selector.css('span.price::text').get('').strip()
description = context.selector.css('p.description::text').get('').strip()
if name:
await context.push_data(
{
'url': context.request.url,
'name': name,
'price': price,
'description': description,
}
)

await crawler.run(['http://localhost:8080/'])

await Actor.set_value(
'CRAWLER_RESULT',
{
'pages_visited_count': len(pages_visited),
'crawler_type': 'ParselCrawler',
},
)
@@ -0,0 +1,40 @@
from __future__ import annotations

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext

from apify import Actor


async def main() -> None:
async with Actor:
pages_visited: list[str] = []
crawler = PlaywrightCrawler()

@crawler.router.default_handler
async def handler(context: PlaywrightCrawlingContext) -> None:
pages_visited.append(context.request.url)
await context.enqueue_links()

if '/products/' in context.request.url:
name = await context.page.locator('h1').text_content()
price = await context.page.locator('span.price').text_content()
description = await context.page.locator('p.description').text_content()
if name:
await context.push_data(
{
'url': context.request.url,
'name': name.strip(),
'price': (price or '').strip(),
'description': (description or '').strip(),
}
)

await crawler.run(['http://localhost:8080/'])

await Actor.set_value(
'CRAWLER_RESULT',
{
'pages_visited_count': len(pages_visited),
'crawler_type': 'PlaywrightCrawler',
},
)
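
Each of the crawler mains above stores a summary under the CRAWLER_RESULT key of the run's default key-value store. A sketch of how an e2e test might assert on it, assuming an `ActorClientAsync` returned by the `make_actor` fixture and the apify-client call chain below; the expected type and threshold are illustrative:

# Hypothetical assertion helper (not part of this diff).
from apify_client.clients import ActorClientAsync  # assumed import path


async def assert_crawler_result(actor: ActorClientAsync, expected_type: str) -> None:
    """Run the Actor and check the CRAWLER_RESULT record its main() stores."""
    run = await actor.call()  # waits for the run to finish
    assert run is not None
    assert run['status'] == 'SUCCEEDED'

    record = await actor.last_run().key_value_store().get_record('CRAWLER_RESULT')
    assert record is not None
    assert record['value']['crawler_type'] == expected_type
    assert record['value']['pages_visited_count'] > 1  # the crawl followed at least one link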
@@ -0,0 +1,9 @@
FROM apify/actor-python-playwright:PYTHON_VERSION_PLACEHOLDER

COPY . ./

RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*

RUN pip install --force-reinstall -r requirements.txt

CMD ["sh", "-c", "python server.py & python -m src"]