diff --git a/agentstack/_tools/vision/__init__.py b/agentstack/_tools/vision/__init__.py index e491153a..8d67cfd4 100644 --- a/agentstack/_tools/vision/__init__.py +++ b/agentstack/_tools/vision/__init__.py @@ -1,70 +1,111 @@ -"""Vision tool for analyzing images using OpenAI's Vision API.""" - +from typing import IO, Optional +import os +from pathlib import Path import base64 -from typing import Optional +import tempfile import requests -from openai import OpenAI +import anthropic __all__ = ["analyze_image"] +PROMPT = os.getenv('VISION_PROMPT', "What's in this image?") +MODEL = os.getenv('VISION_MODEL', "claude-3-5-sonnet-20241022") +MAX_TOKENS: int = int(os.getenv('VISION_MAX_TOKENS', 1024)) -def analyze_image(image_path_url: str) -> str: - """ - Analyze an image using OpenAI's Vision API. +MEDIA_TYPES = { + "jpg": "image/jpeg", + "jpeg": "image/jpeg", + "png": "image/png", + "gif": "image/gif", + "webp": "image/webp", +} +ALLOWED_MEDIA_TYPES = list(MEDIA_TYPES.keys()) - Args: - image_path_url: Local path or URL to the image +# image sizes that will not be resized +# TODO is there any value in resizing pre-upload? +# 1:1 1092x1092 px +# 3:4 951x1268 px +# 2:3 896x1344 px +# 9:16 819x1456 px +# 1:2 784x1568 px - Returns: - str: Description of the image contents - """ - client = OpenAI() - if not image_path_url: - return "Image Path or URL is required." 
+def _get_media_type(image_filename: str) -> Optional[str]: + """Get the media type from an image filename.""" + for ext, media_type in MEDIA_TYPES.items(): + if image_filename.endswith(ext): + return media_type + return None + - if "http" in image_path_url: - return _analyze_web_image(client, image_path_url) - return _analyze_local_image(client, image_path_url) +def _encode_image(image_handle: IO) -> str: + """Encode a file handle to base64.""" + return base64.b64encode(image_handle.read()).decode("utf-8") -def _analyze_web_image(client: OpenAI, image_path_url: str) -> str: - response = client.chat.completions.create( - model="gpt-4-vision-preview", +def _make_anthropic_request(image_handle: IO, media_type: str) -> anthropic.types.Message: + """Make a request to the Anthropic API using an image.""" + client = anthropic.Anthropic() + data = _encode_image(image_handle) + return client.messages.create( + model=MODEL, + max_tokens=MAX_TOKENS, messages=[ { "role": "user", "content": [ - {"type": "text", "text": "What's in this image?"}, - {"type": "image_url", "image_url": {"url": image_path_url}}, + { # type: ignore + "type": "image", + "source": { + "type": "base64", + "media_type": media_type, + "data": data, + }, + }, + { # type: ignore + "type": "text", + "text": PROMPT, + }, ], } ], - max_tokens=300, ) - return response.choices[0].message.content # type: ignore[return-value] -def _analyze_local_image(client: OpenAI, image_path: str) -> str: - base64_image = _encode_image(image_path) - headers = {"Content-Type": "application/json", "Authorization": f"Bearer {client.api_key}"} - payload = { - "model": "gpt-4-vision-preview", - "messages": [ - { - "role": "user", - "content": [ - {"type": "text", "text": "What's in this image?"}, - {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}, - ], - } - ], - "max_tokens": 300, - } - response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) - return 
response.json()["choices"][0]["message"]["content"] +def _analyze_web_image(image_url: str, media_type: str) -> str: + """Analyze an image from a URL.""" + with tempfile.NamedTemporaryFile() as temp_file: + temp_file.write(requests.get(image_url).content) + temp_file.flush() + temp_file.seek(0) + response = _make_anthropic_request(temp_file, media_type) + return response.content[0].text # type: ignore -def _encode_image(image_path: str) -> str: +def _analyze_local_image(image_path: str, media_type: str) -> str: + """Analyze an image from a local file.""" + with open(image_path, "rb") as image_file: - return base64.b64encode(image_file.read()).decode("utf-8") + response = _make_anthropic_request(image_file, media_type) + return response.content[0].text # type: ignore + + +def analyze_image(image_path_or_url: str) -> str: + """ + Analyze an image using Anthropic's Claude Vision API. + + Args: + image_path_or_url: Local path or URL to the image. + + Returns: + str: Description of the image contents + """ + if not image_path_or_url: + return "Image Path or URL is required." + + media_type = _get_media_type(image_path_or_url) + if not media_type: + return f"Unsupported image type use {ALLOWED_MEDIA_TYPES}." 
+ + if "http" in image_path_or_url: + return _analyze_web_image(image_path_or_url, media_type) + return _analyze_local_image(image_path_or_url, media_type) diff --git a/agentstack/_tools/vision/config.json b/agentstack/_tools/vision/config.json index 37963f0d..0852aa20 100644 --- a/agentstack/_tools/vision/config.json +++ b/agentstack/_tools/vision/config.json @@ -2,10 +2,13 @@ "name": "vision", "category": "image-analysis", "env": { - "OPENAI_API_KEY": null + "ANTHROPIC_API_KEY": null, + "VISION_PROMPT": null, + "VISION_MODEL": null, + "VISION_MAX_TOKENS": null }, "dependencies": [ - "openai>=1.0.0", + "anthropic>=0.45.2", "requests>=2.31.0" ], "tools": ["analyze_image"] diff --git a/docs/llms.txt b/docs/llms.txt index 2e0af5b7..fa23ea5c 100644 --- a/docs/llms.txt +++ b/docs/llms.txt @@ -514,10 +514,6 @@ which adheres to a common pattern or exporting your project to share. Templates are versioned, and each previous version provides a method to convert it's content to the current version. -> TODO: Templates are currently identified as `proj_templates` since they conflict -with the templates used by `generation`. Move existing templates to be part of -the generation package. - ### `TemplateConfig.from_user_input(identifier: str)` `` Returns a `TemplateConfig` object for either a URL, file path, or builtin template name. 
@@ -716,7 +712,7 @@ title: 'System Analyzer' description: 'Inspect a project directory and improve it' --- -[View Template](https://github.com/AgentOps-AI/AgentStack/blob/main/agentstack/templates/proj_templates/system_analyzer.json) +[View Template](https://github.com/AgentOps-AI/AgentStack/blob/main/agentstack/templates/system_analyzer.json) ```bash agentstack init --template=system_analyzer @@ -737,7 +733,7 @@ title: 'Researcher' description: 'Research and report result from a query' --- -[View Template](https://github.com/AgentOps-AI/AgentStack/blob/main/agentstack/templates/proj_templates/research.json) +[View Template](https://github.com/AgentOps-AI/AgentStack/blob/main/agentstack/templates/research.json) ```bash agentstack init --template=research @@ -828,7 +824,54 @@ title: 'Content Creator' description: 'Research a topic and create content on it' --- -[View Template](https://github.com/AgentOps-AI/AgentStack/blob/main/agentstack/templates/proj_templates/content_creator.json) +[View Template](https://github.com/AgentOps-AI/AgentStack/blob/main/agentstack/templates/content_creator.json) + +## frameworks/list.mdx + +--- +title: Frameworks +description: 'Supported frameworks in AgentStack' +icon: 'ship' +--- + +These are documentation links to the frameworks supported directly by AgentStack. + +To start a project with one of these frameworks, use +```bash +agentstack init --framework +``` + +## Framework Docs + + + An intuitive agentic framework (recommended) + + + A complex but capable framework with a _steep_ learning curve + + + A simple framework with a cult following + + + An expansive framework with many ancillary features + + ## tools/package-structure.mdx @@ -1043,7 +1086,7 @@ You can pass the `--wizard` flag to `agentstack init` to use an interactive proj You can also pass a `--template=` argument to `agentstack init` which will pre-populate your project with functionality from a built-in template, or one found on the internet. 
A `template_name` can be one of three identifiers: -- A built-in AgentStack template (see the `templates/proj_templates` directory in the AgentStack repo for bundled templates). +- A built-in AgentStack template (see the `templates` directory in the AgentStack repo for bundled templates). - A template file from the internet; pass the full https URL of the template. - A local template file; pass an absolute or relative path. diff --git a/tests/fixtures/test_image.jpg b/tests/fixtures/test_image.jpg new file mode 100644 index 00000000..a30e369d Binary files /dev/null and b/tests/fixtures/test_image.jpg differ diff --git a/tests/tools/test_tool_vision.py b/tests/tools/test_tool_vision.py new file mode 100644 index 00000000..e04415bc --- /dev/null +++ b/tests/tools/test_tool_vision.py @@ -0,0 +1,56 @@ +import os +from pathlib import Path +import unittest +from agentstack._tools import ToolConfig + + +TEST_IMAGE_PATH: Path = Path(__file__).parent.parent / 'fixtures/test_image.jpg' + + +class VisionToolTest(unittest.TestCase): + def setUp(self): + tool = ToolConfig.from_tool_name('vision') + for dependency in tool.dependencies: + os.system(f"pip install {dependency}") + + try: + from agentstack._tools import vision + except ImportError as e: + self.skipTest(str(e)) + + def test_get_media_type(self): + from agentstack._tools.vision import _get_media_type + + self.assertEqual(_get_media_type("image.jpg"), "image/jpeg") + self.assertEqual(_get_media_type("image.jpeg"), "image/jpeg") + self.assertEqual(_get_media_type("http://google.com/image.png"), "image/png") + self.assertEqual(_get_media_type("/foo/bar/image.gif"), "image/gif") + self.assertEqual(_get_media_type("image.webp"), "image/webp") + self.assertEqual(_get_media_type("document.pdf"), None) + + def test_encode_image(self): + from agentstack._tools.vision import _encode_image + + with open(TEST_IMAGE_PATH, "rb") as image_file: + encoded_image = _encode_image(image_file) + print(encoded_image[:200]) + 
self.assertTrue(isinstance(encoded_image, str)) + + def test_analyze_image_web_live(self): + from agentstack._tools.vision import analyze_image + + if not os.environ.get('ANTHROPIC_API_KEY'): + self.skipTest("ANTHROPIC_API_KEY not set") + + image_url = "https://github.com/AgentOps-AI/AgentStack/blob/7c1bf897742cfb58f4942a2547be70a0a1bb767a/tests/fixtures/test_image.jpg?raw=true" + result = analyze_image(image_url) + self.assertTrue(isinstance(result, str)) + + def test_analyze_image_local_live(self): + from agentstack._tools.vision import analyze_image + + if not os.environ.get('ANTHROPIC_API_KEY'): + self.skipTest("ANTHROPIC_API_KEY not set") + + result = analyze_image(str(TEST_IMAGE_PATH)) + self.assertTrue(isinstance(result, str))