agentstack-ai · bboynton97 · Feb 13, 2025 · Feb 7, 2025 · Feb 13, 2025 · Feb 13, 2025
diff --git a/agentstack/_tools/vision/__init__.py b/agentstack/_tools/vision/__init__.py
@@ -1,70 +1,111 @@
-"""Vision tool for analyzing images using OpenAI's Vision API."""
-
+from typing import IO, Optional
+import os
+from pathlib import Path
 import base64
-from typing import Optional
+import tempfile
 import requests
-from openai import OpenAI
+import anthropic
 
 __all__ = ["analyze_image"]
 
+PROMPT = os.getenv('VISION_PROMPT', "What's in this image?")  # text block sent alongside every image
+MODEL = os.getenv('VISION_MODEL', "claude-3-5-sonnet-20241022")  # passed as `model` to messages.create
+MAX_TOKENS: int = int(os.getenv('VISION_MAX_TOKENS', 1024))  # env values arrive as str, hence int()
 
-def analyze_image(image_path_url: str) -> str:
-    """
-    Analyze an image using OpenAI's Vision API.
+MEDIA_TYPES = {  # file extension (without the dot) -> MIME type passed to the API as "media_type"
+    "jpg": "image/jpeg",
+    "jpeg": "image/jpeg",
+    "png": "image/png",
+    "gif": "image/gif",
+    "webp": "image/webp",
+}
+ALLOWED_MEDIA_TYPES = list(MEDIA_TYPES.keys())  # the extensions (not MIME types) listed in the "unsupported" message
 
-    Args:
-        image_path_url: Local path or URL to the image
+# image sizes that will not be resized
+# TODO is there any value in resizing pre-upload?
+# 1:1	1092x1092 px
+# 3:4	951x1268 px
+# 2:3	896x1344 px
+# 9:16	819x1456 px
+# 1:2	784x1568 px
 
-    Returns:
-        str: Description of the image contents
-    """
-    client = OpenAI()
 
-    if not image_path_url:
-        return "Image Path or URL is required."
+def _get_media_type(image_filename: str) -> Optional[str]:
+    """Map a path or URL to its media type by file extension (case-insensitive; query/fragment ignored)."""
+    path = image_filename.split("?", 1)[0].split("#", 1)[0].lower()
+    _, dot, ext = path.rpartition(".")
+    # Require a real ".ext" suffix so names like "notajpg" are rejected; unknown extensions give None.
+    return MEDIA_TYPES.get(ext) if dot else None
+
 
-    if "http" in image_path_url:
-        return _analyze_web_image(client, image_path_url)
-    return _analyze_local_image(client, image_path_url)
+def _encode_image(image_handle: IO) -> str:
+    """Read everything left in the binary handle and return it as base64 text."""
+    return str(base64.b64encode(image_handle.read()), "utf-8")
 
 
-def _analyze_web_image(client: OpenAI, image_path_url: str) -> str:
-    response = client.chat.completions.create(
-        model="gpt-4-vision-preview",
+def _make_anthropic_request(image_handle: IO, media_type: str) -> anthropic.types.Message:
+    """Base64-encode the image and send it, followed by PROMPT, to the Anthropic Messages API."""
+    client = anthropic.Anthropic()
+    data = _encode_image(image_handle)
+    return client.messages.create(
+        model=MODEL,
+        max_tokens=MAX_TOKENS,
         messages=[
             {
                 "role": "user",
                 "content": [
-                    {"type": "text", "text": "What's in this image?"},
-                    {"type": "image_url", "image_url": {"url": image_path_url}},
+                    {  # type: ignore
+                        "type": "image",
+                        "source": {
+                            "type": "base64",
+                            "media_type": media_type,
+                            "data": data,
+                        },
+                    },
+                    {  # type: ignore
+                        "type": "text",
+                        "text": PROMPT,
+                    },
                 ],
             }
         ],
-        max_tokens=300,
     )
-    return response.choices[0].message.content  # type: ignore[return-value]
 
 
-def _analyze_local_image(client: OpenAI, image_path: str) -> str:
-    base64_image = _encode_image(image_path)
-    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {client.api_key}"}
-    payload = {
-        "model": "gpt-4-vision-preview",
-        "messages": [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "What's in this image?"},
-                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
-                ],
-            }
-        ],
-        "max_tokens": 300,
-    }
-    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
-    return response.json()["choices"][0]["message"]["content"]
+def _analyze_web_image(image_url: str, media_type: str) -> str:
+    """Download an image from a URL and analyze it; raises requests.HTTPError if the download fails."""
+    download = requests.get(image_url, timeout=30)  # bounded wait instead of hanging on a dead host
+    download.raise_for_status()  # never forward an HTTP error page to the model as image data
+    with tempfile.NamedTemporaryFile() as temp_file:
+        temp_file.write(download.content)
+        temp_file.seek(0)  # seeking flushes the buffered write and rewinds for reading
+        return _make_anthropic_request(temp_file, media_type).content[0].text  # type: ignore
 
 
-def _encode_image(image_path: str) -> str:
+def _analyze_local_image(image_path: str, media_type: str) -> str:
+    """Open a local image in binary mode and return the text of the response's first content block."""
     with open(image_path, "rb") as image_file:
-        return base64.b64encode(image_file.read()).decode("utf-8")
+        response = _make_anthropic_request(image_file, media_type)
+        return response.content[0].text  # type: ignore
+
+
+def analyze_image(image_path_or_url: str) -> str:
+    """
+    Analyze an image using Anthropic's Messages API (vision).
+
+    Args:
+        image_path_or_url: Local path or http(s) URL to the image.
+
+    Returns:
+        str: Description of the image contents, or an error message for bad input.
+    """
+    if not image_path_or_url:
+        return "Image Path or URL is required."
+
+    media_type = _get_media_type(image_path_or_url)
+    if not media_type:
+        return f"Unsupported image type use {ALLOWED_MEDIA_TYPES}."
+
+    if image_path_or_url.lower().startswith(("http://", "https://")):  # scheme check, not a substring match
+        return _analyze_web_image(image_path_or_url, media_type)
+    return _analyze_local_image(image_path_or_url, media_type)
diff --git a/agentstack/_tools/vision/config.json b/agentstack/_tools/vision/config.json
@@ -2,10 +2,13 @@
   "name": "vision",
   "category": "image-analysis",
   "env": {
-    "OPENAI_API_KEY": null
+    "ANTHROPIC_API_KEY": null,
+    "VISION_PROMPT": null,
+    "VISION_MODEL": null,
+    "VISION_MAX_TOKENS": null
   },
   "dependencies": [
-    "openai>=1.0.0",
+    "anthropic>=0.45.2",
     "requests>=2.31.0"
   ],
   "tools": ["analyze_image"]

diff --git a/docs/llms.txt b/docs/llms.txt
@@ -514,10 +514,6 @@ which adheres to a common pattern or exporting your project to share.
 Templates are versioned, and each previous version provides a method to convert
 it's content to the current version. 
 
-> TODO: Templates are currently identified as `proj_templates` since they conflict
-with the templates used by `generation`. Move existing templates to be part of
-the generation package. 
-
 ### `TemplateConfig.from_user_input(identifier: str)`
 `<TemplateConfig>` Returns a `TemplateConfig` object for either a URL, file path,
 or builtin template name.
@@ -716,7 +712,7 @@ title: 'System Analyzer'
 description: 'Inspect a project directory and improve it'
 ---
 
-[View Template](https://github.com/AgentOps-AI/AgentStack/blob/main/agentstack/templates/proj_templates/system_analyzer.json)
+[View Template](https://github.com/AgentOps-AI/AgentStack/blob/main/agentstack/templates/system_analyzer.json)
 
 ```bash
 agentstack init --template=system_analyzer
@@ -737,7 +733,7 @@ title: 'Researcher'
 description: 'Research and report result from a query'
 ---
 
-[View Template](https://github.com/AgentOps-AI/AgentStack/blob/main/agentstack/templates/proj_templates/research.json)
+[View Template](https://github.com/AgentOps-AI/AgentStack/blob/main/agentstack/templates/research.json)
 
 ```bash
 agentstack init --template=research
@@ -828,7 +824,54 @@ title: 'Content Creator'
 description: 'Research a topic and create content on it'
 ---
 
-[View Template](https://github.com/AgentOps-AI/AgentStack/blob/main/agentstack/templates/proj_templates/content_creator.json)
+[View Template](https://github.com/AgentOps-AI/AgentStack/blob/main/agentstack/templates/content_creator.json)
+
+## frameworks/list.mdx
+
+---
+title: Frameworks
+description: 'Supported frameworks in AgentStack'
+icon: 'ship'
+---
+
+These are documentation links to the frameworks supported directly by AgentStack.
+
+To start a project with one of these frameworks, use:
+```bash
+agentstack init <project_name> --framework <framework_name>
+```
+
+## Framework Docs
+<CardGroup cols={3}>
+  <Card
+    title="CrewAI"
+    icon="ship"
+    href="https://docs.crewai.com/introduction"
+  >
+    An intuitive agentic framework (recommended)
+  </Card>
+  <Card
+    title="LangGraph"
+    icon="circle-nodes"
+    href="https://langchain-ai.github.io/langgraph/"
+  >
+    A complex but capable framework with a _steep_ learning curve
+  </Card>
+  <Card
+    title="OpenAI Swarm"
+    icon="bee"
+    href="https://github.com/openai/swarm"
+  >
+    A simple framework with a cult following
+  </Card>
+  <Card
+    title="LlamaIndex"
+    icon="layer-group"
+    href="https://docs.llamaindex.ai/en/stable/"
+  >
+    An expansive framework with many ancillary features
+  </Card>
+</CardGroup>
 
 ## tools/package-structure.mdx
 
@@ -1043,7 +1086,7 @@ You can pass the `--wizard` flag to `agentstack init` to use an interactive proj
 You can also pass a `--template=<template_name>` argument to `agentstack init` which will pre-populate your project with functionality
 from a built-in template, or one found on the internet. A `template_name` can be one of three identifiers:
 
-- A built-in AgentStack template (see the `templates/proj_templates` directory in the AgentStack repo for bundled templates).
+- A built-in AgentStack template (see the `templates` directory in the AgentStack repo for bundled templates).
 - A template file from the internet; pass the full https URL of the template.
 - A local template file; pass an absolute or relative path. 
 

diff --git a/tests/fixtures/test_image.jpg b/tests/fixtures/test_image.jpg
diff --git a/tests/tools/test_tool_vision.py b/tests/tools/test_tool_vision.py
@@ -0,0 +1,56 @@
+import os
+from pathlib import Path
+import unittest
+from agentstack._tools import ToolConfig
+
+
+TEST_IMAGE_PATH: Path = Path(__file__).parent.parent / 'fixtures/test_image.jpg'
+
+
+class VisionToolTest(unittest.TestCase):  # offline unit tests plus live tests that need ANTHROPIC_API_KEY
+    def setUp(self):
+        tool = ToolConfig.from_tool_name('vision')
+        for dependency in tool.dependencies:
+            os.system(f"pip install {dependency}")  # NOTE(review): mutates the active environment on every test
+
+        try:
+            from agentstack._tools import vision
+        except ImportError as e:
+            self.skipTest(str(e))
+
+    def test_get_media_type(self):
+        from agentstack._tools.vision import _get_media_type
+
+        self.assertEqual(_get_media_type("image.jpg"), "image/jpeg")
+        self.assertEqual(_get_media_type("image.jpeg"), "image/jpeg")
+        self.assertEqual(_get_media_type("http://google.com/image.png"), "image/png")
+        self.assertEqual(_get_media_type("/foo/bar/image.gif"), "image/gif")
+        self.assertEqual(_get_media_type("image.webp"), "image/webp")
+        self.assertIsNone(_get_media_type("document.pdf"))
+
+    def test_encode_image(self):
+        from agentstack._tools.vision import _encode_image
+
+        with open(TEST_IMAGE_PATH, "rb") as image_file:
+            encoded_image = _encode_image(image_file)
+            # _encode_image must hand back text (str), not raw base64 bytes
+            self.assertIsInstance(encoded_image, str)
+
+    def test_analyze_image_web_live(self):
+        from agentstack._tools.vision import analyze_image
+
+        if not os.environ.get('ANTHROPIC_API_KEY'):
+            self.skipTest("ANTHROPIC_API_KEY not set")
+
+        image_url = "https://github.com/AgentOps-AI/AgentStack/blob/7c1bf897742cfb58f4942a2547be70a0a1bb767a/tests/fixtures/test_image.jpg?raw=true"
+        result = analyze_image(image_url)
+        self.assertIsInstance(result, str)
+
+    def test_analyze_image_local_live(self):
+        from agentstack._tools.vision import analyze_image
+
+        if not os.environ.get('ANTHROPIC_API_KEY'):
+            self.skipTest("ANTHROPIC_API_KEY not set")
+
+        result = analyze_image(str(TEST_IMAGE_PATH))
+        self.assertIsInstance(result, str)