From bde6d15d4e03e45721571fd67f3c5a3a893a0c51 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Tue, 3 Feb 2026 13:24:08 -0800 Subject: [PATCH 1/3] Show actual SEARCH/REPLACE content in diff summaries ## Problem When diff-based evolution is enabled, the "Previous Attempts" section of prompts shows changes like: Change 1: Replace 15 lines with 18 lines This gives the LLM no visibility into what the actual edits were, making it harder to: - Learn from successful patterns - Avoid repeating failed exact matches - Understand what format produces valid SEARCH blocks This contributes to the high rate of "apply diff fail" errors (see issue #346) where SEARCH patterns don't exactly match the original code. ## Solution Update `format_diff_summary()` to show actual content for multi-line blocks: Change 1: Replace: def old_function(): return False with: def new_function(): return True Single-line changes remain compact: Change 1: 'x = 1' to 'x = 2' Add `_format_block_lines()` helper with configurable truncation limits. ## Configuration New options in `prompt:` config section: ```yaml prompt: diff_summary_max_line_len: 100 # Truncate lines longer than this diff_summary_max_lines: 30 # Max lines per SEARCH/REPLACE block ``` ## Files Changed - `openevolve/config.py` - Add PromptConfig options - `openevolve/utils/code_utils.py` - Update format_diff_summary - `openevolve/iteration.py` - Pass config to format_diff_summary - `openevolve/process_parallel.py` - Pass config to format_diff_summary - `tests/test_code_utils.py` - Add tests for new behavior Co-Authored-By: Claude Opus 4.5 --- configs/default_config.yaml | 7 ++- openevolve/config.py | 8 ++- openevolve/iteration.py | 31 +++++++---- openevolve/process_parallel.py | 37 ++++++++----- openevolve/utils/code_utils.py | 35 +++++++++---- tests/test_code_utils.py | 95 +++++++++++++++++++++++++++++++++- 6 files changed, 179 insertions(+), 34 deletions(-) diff --git a/configs/default_config.yaml b/configs/default_config.yaml index 928465bf5..14ae54556 100644 --- a/configs/default_config.yaml +++ b/configs/default_config.yaml @@ -78,10 +78,15 @@ prompt: # Feature extraction and program labeling thresholds # These control how the LLM perceives and categorizes programs suggest_simplification_after_chars: 500 # Suggest simplifying if program exceeds this many characters - include_changes_under_chars: 100 # Include change descriptions in features if under this length + include_changes_under_chars: 100 # Include change descriptions in features if under this length concise_implementation_max_lines: 10 # Label as "concise" if program has this many lines or fewer comprehensive_implementation_min_lines: 50 # Label as "comprehensive" if program has this many lines or more + # Diff summary formatting for "Previous Attempts" section + # Controls how SEARCH/REPLACE blocks are displayed in prompts + diff_summary_max_line_len: 100 # Truncate lines longer than this (with "...") + diff_summary_max_lines: 30 # Max lines per SEARCH/REPLACE block + # Note: meta-prompting features are not yet implemented # Database configuration diff --git a/openevolve/config.py b/openevolve/config.py index 86f65d765..9ec2314ae 100644 --- a/openevolve/config.py +++ b/openevolve/config.py @@ -281,6 +281,10 @@ class PromptConfig: 50 # Label as "comprehensive" if program has this many lines or more ) + # Diff summary formatting for "Previous Attempts" section + diff_summary_max_line_len: int = 100 # Truncate lines longer than this + diff_summary_max_lines: int = 30 # Max lines per SEARCH/REPLACE block + # Backward compatibility - deprecated code_length_threshold: Optional[int] = ( None # Deprecated: use suggest_simplification_after_chars @@ -340,7 +344,9 @@ class DatabaseConfig: artifact_size_threshold: int = 32 * 1024 # 32KB threshold cleanup_old_artifacts: bool = True artifact_retention_days: int = 30 - max_snapshot_artifacts: Optional[int] = 100 # Max artifacts in worker snapshots (None=unlimited) + max_snapshot_artifacts: Optional[int] = ( + 100 # Max artifacts in worker snapshots (None=unlimited) + ) novelty_llm: Optional["LLMInterface"] = None embedding_model: Optional[str] = None diff --git a/openevolve/iteration.py b/openevolve/iteration.py index 401731ff0..7afaff75b 100644 --- a/openevolve/iteration.py +++ b/openevolve/iteration.py @@ -1,12 +1,12 @@ import asyncio -import os -import uuid import logging +import os import time +import uuid from dataclasses import dataclass -from openevolve.database import Program, ProgramDatabase from openevolve.config import Config +from openevolve.database import Program, ProgramDatabase from openevolve.evaluator import Evaluator from openevolve.llm.ensemble import LLMEnsemble from openevolve.prompt.sampler import PromptSampler @@ -63,8 +63,7 @@ async def run_iteration_with_shared_db( # Build prompt if config.prompt.programs_as_changes_description: parent_changes_desc = ( - parent.changes_description - or config.prompt.initial_changes_description + parent.changes_description or config.prompt.initial_changes_description ) child_changes_desc = parent_changes_desc else: @@ -115,20 +114,34 @@ async def run_iteration_with_shared_db( return None child_code, _ = apply_diff_blocks(parent.code, code_blocks) - child_changes_desc, desc_applied = apply_diff_blocks(parent_changes_desc, desc_blocks) + child_changes_desc, desc_applied = apply_diff_blocks( + parent_changes_desc, desc_blocks + ) # Must update the previous changes description - if desc_applied == 0 or not child_changes_desc.strip() or child_changes_desc.strip() == parent_changes_desc.strip(): + if ( + desc_applied == 0 + or not child_changes_desc.strip() + or child_changes_desc.strip() == parent_changes_desc.strip() + ): logger.warning( f"Iteration {iteration+1}: changes_description was not updated or empty, program is discarded" ) return None - changes_summary = format_diff_summary(code_blocks) + changes_summary = format_diff_summary( + code_blocks, + max_line_len=config.prompt.diff_summary_max_line_len, + max_lines=config.prompt.diff_summary_max_lines, + ) else: # All diffs applied only to code child_code = apply_diff(parent.code, llm_response, config.diff_pattern) - changes_summary = format_diff_summary(diff_blocks) + changes_summary = format_diff_summary( + diff_blocks, + max_line_len=config.prompt.diff_summary_max_line_len, + max_lines=config.prompt.diff_summary_max_lines, + ) else: # Parse full rewrite new_code = parse_full_rewrite(llm_response, config.language) diff --git a/openevolve/process_parallel.py b/openevolve/process_parallel.py index 2d65b6ce4..64e945bb1 100644 --- a/openevolve/process_parallel.py +++ b/openevolve/process_parallel.py @@ -170,8 +170,7 @@ def _run_iteration_worker( # Build prompt if _worker_config.prompt.programs_as_changes_description: parent_changes_desc = ( - parent.changes_description - or _worker_config.prompt.initial_changes_description + parent.changes_description or _worker_config.prompt.initial_changes_description ) child_changes_desc = parent_changes_desc else: @@ -223,7 +222,9 @@ def _run_iteration_worker( diff_blocks = extract_diffs(llm_response, _worker_config.diff_pattern) if not diff_blocks: - return SerializableResult(error="No valid diffs found in response", iteration=iteration) + return SerializableResult( + error="No valid diffs found in response", iteration=iteration + ) if _worker_config.prompt.programs_as_changes_description: try: @@ -236,20 +237,34 @@ def _run_iteration_worker( return SerializableResult(error=str(e), iteration=iteration) child_code, _ = apply_diff_blocks(parent.code, code_blocks) - child_changes_desc, desc_applied = apply_diff_blocks(parent_changes_desc, desc_blocks) + child_changes_desc, desc_applied = apply_diff_blocks( + parent_changes_desc, desc_blocks + ) # Must update the previous changes description - if desc_applied == 0 or not child_changes_desc.strip() or child_changes_desc.strip() == parent_changes_desc.strip(): + if ( + desc_applied == 0 + or not child_changes_desc.strip() + or child_changes_desc.strip() == parent_changes_desc.strip() + ): return SerializableResult( error="changes_description was not updated or empty, program is discarded", iteration=iteration, ) - changes_summary = format_diff_summary(code_blocks) + changes_summary = format_diff_summary( + code_blocks, + max_line_len=_worker_config.prompt.diff_summary_max_line_len, + max_lines=_worker_config.prompt.diff_summary_max_lines, + ) else: # All diffs applied only to code child_code = apply_diff(parent.code, llm_response, _worker_config.diff_pattern) - changes_summary = format_diff_summary(diff_blocks) + changes_summary = format_diff_summary( + diff_blocks, + max_line_len=_worker_config.prompt.diff_summary_max_line_len, + max_lines=_worker_config.prompt.diff_summary_max_lines, + ) else: from openevolve.utils.code_utils import parse_full_rewrite @@ -588,10 +603,8 @@ async def run_evolution( # Island management # get current program island id - island_id = child_program.metadata.get( - "island", self.database.current_island - ) - #use this to increment island generation + island_id = child_program.metadata.get("island", self.database.current_island) + # use this to increment island generation self.database.increment_island_generation(island_idx=island_id) # Check migration @@ -709,7 +722,7 @@ async def run_evolution( f"(best score: {best_score:.4f})" ) break - + else: # Event-based early stopping if current_score == self.config.convergence_threshold: diff --git a/openevolve/utils/code_utils.py b/openevolve/utils/code_utils.py index db506232a..cde6a971a 100644 --- a/openevolve/utils/code_utils.py +++ b/openevolve/utils/code_utils.py @@ -120,12 +120,32 @@ def parse_full_rewrite(llm_response: str, language: str = "python") -> Optional[ return llm_response -def format_diff_summary(diff_blocks: List[Tuple[str, str]]) -> str: +def _format_block_lines(lines: List[str], max_line_len: int = 100, max_lines: int = 30) -> str: + """Format a block of lines for diff summary: show all lines (truncated per line, optional cap).""" + truncated = [] + for line in lines[:max_lines]: + s = line.rstrip() + if len(s) > max_line_len: + s = s[: max_line_len - 3] + "..." + truncated.append(" " + s) + if len(lines) > max_lines: + truncated.append(f" ... ({len(lines) - max_lines} more lines)") + return "\n".join(truncated) if truncated else " (empty)" + + +def format_diff_summary( + diff_blocks: List[Tuple[str, str]], + max_line_len: int = 100, + max_lines: int = 30, +) -> str: """ - Create a human-readable summary of the diff + Create a human-readable summary of the diff. + For multi-line blocks, shows the full search and replace content (all lines). Args: diff_blocks: List of (search_text, replace_text) tuples + max_line_len: Maximum characters per line before truncation (default: 100) + max_lines: Maximum lines per SEARCH/REPLACE block (default: 30) Returns: Summary string @@ -136,17 +156,12 @@ def format_diff_summary(diff_blocks: List[Tuple[str, str]]) -> str: search_lines = search_text.strip().split("\n") replace_lines = replace_text.strip().split("\n") - # Create a short summary if len(search_lines) == 1 and len(replace_lines) == 1: summary.append(f"Change {i+1}: '{search_lines[0]}' to '{replace_lines[0]}'") else: - search_summary = ( - f"{len(search_lines)} lines" if len(search_lines) > 1 else search_lines[0] - ) - replace_summary = ( - f"{len(replace_lines)} lines" if len(replace_lines) > 1 else replace_lines[0] - ) - summary.append(f"Change {i+1}: Replace {search_summary} with {replace_summary}") + search_block = _format_block_lines(search_lines, max_line_len, max_lines) + replace_block = _format_block_lines(replace_lines, max_line_len, max_lines) + summary.append(f"Change {i+1}: Replace:\n{search_block}\nwith:\n{replace_block}") return "\n".join(summary) diff --git a/tests/test_code_utils.py b/tests/test_code_utils.py index 1446b01fd..20f269b7a 100644 --- a/tests/test_code_utils.py +++ b/tests/test_code_utils.py @@ -3,7 +3,13 @@ """ import unittest -from openevolve.utils.code_utils import apply_diff, extract_diffs + +from openevolve.utils.code_utils import ( + _format_block_lines, + apply_diff, + extract_diffs, + format_diff_summary, +) class TestCodeUtils(unittest.TestCase): @@ -89,5 +95,92 @@ def hello(): ) +class TestFormatDiffSummary(unittest.TestCase): + """Tests for format_diff_summary showing actual diff content""" + + def test_single_line_changes(self): + """Single-line changes should show inline format""" + diff_blocks = [("x = 1", "x = 2")] + result = format_diff_summary(diff_blocks) + self.assertEqual(result, "Change 1: 'x = 1' to 'x = 2'") + + def test_multi_line_changes_show_actual_content(self): + """Multi-line changes should show actual SEARCH/REPLACE content""" + diff_blocks = [ + ( + "def old():\n return False", + "def new():\n return True", + ) + ] + result = format_diff_summary(diff_blocks) + # Should contain actual code, not "2 lines" + self.assertIn("def old():", result) + self.assertIn("return False", result) + self.assertIn("def new():", result) + self.assertIn("return True", result) + self.assertIn("Replace:", result) + self.assertIn("with:", result) + # Should NOT contain generic line count + self.assertNotIn("2 lines", result) + + def test_multiple_diff_blocks(self): + """Multiple diff blocks should be numbered""" + diff_blocks = [ + ("a = 1", "a = 2"), + ("def foo():\n pass", "def bar():\n return 1"), + ] + result = format_diff_summary(diff_blocks) + self.assertIn("Change 1:", result) + self.assertIn("Change 2:", result) + self.assertIn("'a = 1' to 'a = 2'", result) + self.assertIn("def foo():", result) + self.assertIn("def bar():", result) + + def test_configurable_max_line_len(self): + """max_line_len parameter should control line truncation""" + long_line = "x" * 50 + # Must be multi-line to trigger block format (single-line uses inline format) + diff_blocks = [(long_line + "\nline2", "short\nline2")] + # With default (100), no truncation + result_default = format_diff_summary(diff_blocks) + self.assertNotIn("...", result_default) + # With max_line_len=30, should truncate the long line + result_short = format_diff_summary(diff_blocks, max_line_len=30) + self.assertIn("...", result_short) + + def test_configurable_max_lines(self): + """max_lines parameter should control block truncation""" + many_lines = "\n".join([f"line{i}" for i in range(20)]) + diff_blocks = [(many_lines, "replacement")] + # With max_lines=10, should truncate + result = format_diff_summary(diff_blocks, max_lines=10) + self.assertIn("... (10 more lines)", result) + + def test_block_lines_basic_formatting(self): + """Lines should be indented with 2 spaces""" + lines = ["line1", "line2"] + result = _format_block_lines(lines) + self.assertEqual(result, " line1\n line2") + + def test_block_lines_long_line_truncation(self): + """Lines over 100 chars should be truncated by default""" + long_line = "x" * 150 + result = _format_block_lines([long_line]) + self.assertIn("...", result) + self.assertLess(len(result.split("\n")[0]), 110) + + def test_block_lines_many_lines_truncation(self): + """More than 30 lines should show truncation message by default""" + lines = [f"line{i}" for i in range(50)] + result = _format_block_lines(lines) + self.assertIn("... (20 more lines)", result) + self.assertEqual(len(result.split("\n")), 31) + + def test_block_lines_empty_input(self): + """Empty input should return '(empty)'""" + result = _format_block_lines([]) + self.assertEqual(result, " (empty)") + + if __name__ == "__main__": unittest.main() From 2f9361bf5cf2f01158bd277e7c0bd39a2b3ba1c5 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Wed, 4 Feb 2026 12:18:05 -0800 Subject: [PATCH 2/3] Add prompt meta-evolution feature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements meta-evolution of prompt templates inspired by the Darwin Gödel Machine paper. The system maintains an archive of prompt templates, tracks their success rates, and evolves them over time to improve mutation quality. Key features: - PromptTemplate dataclass with success/improvement tracking - PromptArchive for managing template population with sampling - Configurable scoring weights for template quality assessment - Automatic template evolution at configurable intervals - Checkpoint persistence for prompt archives - Validation to ensure scoring weights sum to 1.0 New config options under `prompt_meta_evolution`: - enabled: Master switch (default: false) - archive_size: Max templates to keep - evolution_interval: Iterations between evolutions - exploration_rate: Random sampling probability - score_weight_*: Configurable scoring formula weights Closes #170 Related to #53 Co-Authored-By: Claude Opus 4.5 --- .../config_meta_evolution.yaml | 45 ++ openevolve/config.py | 53 ++ openevolve/controller.py | 184 ++++++- openevolve/process_parallel.py | 72 ++- openevolve/prompt/meta_evolution.py | 508 ++++++++++++++++++ openevolve/prompt/sampler.py | 203 +++++-- tests/test_prompt_meta_evolution.py | 382 +++++++++++++ 7 files changed, 1389 insertions(+), 58 deletions(-) create mode 100644 examples/function_minimization/config_meta_evolution.yaml create mode 100644 openevolve/prompt/meta_evolution.py create mode 100644 tests/test_prompt_meta_evolution.py diff --git a/examples/function_minimization/config_meta_evolution.yaml b/examples/function_minimization/config_meta_evolution.yaml new file mode 100644 index 000000000..c0856ed4a --- /dev/null +++ b/examples/function_minimization/config_meta_evolution.yaml @@ -0,0 +1,45 @@ +# Configuration for testing prompt meta-evolution feature +max_iterations: 25 +checkpoint_interval: 5 +log_level: INFO + +# LLM configuration +llm: + primary_model: "gpt-4o-mini" + primary_model_weight: 1.0 + api_base: "https://api.openai.com/v1" + temperature: 0.7 + max_tokens: 16000 + timeout: 120 + +# Prompt configuration +prompt: + system_message: "You are an expert programmer specializing in optimization algorithms. Your task is to improve a function minimization algorithm to find the global minimum of a complex function with many local minima. The function is f(x, y) = sin(x) * cos(y) + sin(x*y) + (x^2 + y^2)/20. Focus on improving the search_algorithm function to reliably find the global minimum, escaping local minima that might trap simple algorithms." + +# Prompt meta-evolution - ENABLED for testing +prompt_meta_evolution: + enabled: true + archive_size: 20 + min_uses_for_evolution: 5 # Lower for testing + evolution_interval: 20 # Trigger at iteration 20 + exploration_rate: 0.2 + elite_fraction: 0.3 + +# Database configuration +database: + population_size: 50 + archive_size: 20 + num_islands: 3 + elite_selection_ratio: 0.2 + exploitation_ratio: 0.7 + similarity_threshold: 0.99 + +# Evaluator configuration +evaluator: + timeout: 60 + cascade_thresholds: [1.3] + parallel_evaluations: 3 + +# Evolution settings +diff_based_evolution: true +max_code_length: 20000 diff --git a/openevolve/config.py b/openevolve/config.py index 9ec2314ae..3a1231001 100644 --- a/openevolve/config.py +++ b/openevolve/config.py @@ -397,6 +397,56 @@ class EvolutionTraceConfig: compress: bool = False +@dataclass +class PromptMetaEvolutionConfig: + """Configuration for meta-evolution of prompt templates. + + When enabled, OpenEvolve maintains an archive of prompt templates, + tracks their success rates, and evolves them over time to improve + mutation quality. + """ + + # Master switch + enabled: bool = False + + # Archive settings + archive_size: int = 20 # Max templates to keep in archive + + # Evolution triggers + min_uses_for_evolution: int = 10 # Min uses before template can be evolved + evolution_interval: int = 20 # Trigger evolution every N iterations + + # Sampling behavior + exploration_rate: float = 0.2 # Probability of sampling random template + elite_fraction: float = 0.3 # Fraction of top templates protected from pruning + + # Scoring weights (must sum to 1.0) + # score = w_success * success_rate + w_improvement * improvement_rate + w_fitness * normalized_fitness_delta + score_weight_success: float = 0.3 # Weight for success rate (mutations accepted) + score_weight_improvement: float = 0.4 # Weight for improvement rate (fitness increased) + score_weight_fitness_delta: float = 0.3 # Weight for avg fitness delta magnitude + + # Scoring parameters + score_min_uses: int = 5 # Min uses before score is calculated (else neutral prior) + score_neutral_prior: float = 0.5 # Score returned when uses < min_uses + + def __post_init__(self): + """Validate configuration after initialization.""" + weight_sum = ( + self.score_weight_success + + self.score_weight_improvement + + self.score_weight_fitness_delta + ) + tolerance = 1e-6 + if abs(weight_sum - 1.0) > tolerance: + raise ValueError( + f"Scoring weights must sum to 1.0, got {weight_sum:.6f} " + f"(success={self.score_weight_success}, " + f"improvement={self.score_weight_improvement}, " + f"fitness_delta={self.score_weight_fitness_delta})" + ) + + @dataclass class Config: """Master configuration for OpenEvolve""" @@ -416,6 +466,9 @@ class Config: database: DatabaseConfig = field(default_factory=DatabaseConfig) evaluator: EvaluatorConfig = field(default_factory=EvaluatorConfig) evolution_trace: EvolutionTraceConfig = field(default_factory=EvolutionTraceConfig) + prompt_meta_evolution: PromptMetaEvolutionConfig = field( + default_factory=PromptMetaEvolutionConfig + ) # Evolution settings diff_based_evolution: bool = True diff --git a/openevolve/controller.py b/openevolve/controller.py index 01ffec73c..006a109de 100644 --- a/openevolve/controller.py +++ b/openevolve/controller.py @@ -18,6 +18,7 @@ from openevolve.evolution_trace import EvolutionTracer from openevolve.llm.ensemble import LLMEnsemble from openevolve.process_parallel import ProcessParallelController +from openevolve.prompt.meta_evolution import PromptArchive, evolve_prompt from openevolve.prompt.sampler import PromptSampler from openevolve.utils.code_utils import extract_code_language from openevolve.utils.format_utils import format_improvement_safe, format_metrics_safe @@ -188,6 +189,25 @@ def __init__( # Initialize improved parallel processing components self.parallel_controller = None + # Initialize prompt meta-evolution if enabled + self.prompt_archive = None + if self.config.prompt_meta_evolution.enabled: + self.prompt_archive = PromptArchive( + max_size=self.config.prompt_meta_evolution.archive_size, + min_uses_for_evolution=self.config.prompt_meta_evolution.min_uses_for_evolution, + elite_fraction=self.config.prompt_meta_evolution.elite_fraction, + exploration_rate=self.config.prompt_meta_evolution.exploration_rate, + # Scoring configuration + score_weight_success=self.config.prompt_meta_evolution.score_weight_success, + score_weight_improvement=self.config.prompt_meta_evolution.score_weight_improvement, + score_weight_fitness_delta=self.config.prompt_meta_evolution.score_weight_fitness_delta, + score_min_uses=self.config.prompt_meta_evolution.score_min_uses, + score_neutral_prior=self.config.prompt_meta_evolution.score_neutral_prior, + ) + self._initialize_default_prompt_templates() + self.prompt_sampler.set_prompt_archive(self.prompt_archive) + logger.info("Prompt meta-evolution enabled") + def _setup_logging(self) -> None: """Set up logging""" log_dir = self.config.log_dir or os.path.join(self.output_dir, "logs") @@ -225,7 +245,7 @@ def _setup_manual_mode_queue(self) -> None: if not bool(getattr(self.config.llm, "manual_mode", False)): return - qdir = (Path(self.output_dir).expanduser().resolve() / "manual_tasks_queue") + qdir = Path(self.output_dir).expanduser().resolve() / "manual_tasks_queue" # Clear stale tasks from previous runs if qdir.exists(): @@ -246,6 +266,34 @@ def _load_initial_program(self) -> str: with open(self.initial_program_path, "r") as f: return f.read() + def _initialize_default_prompt_templates(self) -> None: + """Initialize the prompt archive with default templates from TemplateManager.""" + if self.prompt_archive is None: + return + + # Get default templates from the sampler's template manager + tm = self.prompt_sampler.template_manager + + # Get system template + system_template = self.config.prompt.system_message + if system_template in tm.templates: + system_template = tm.get_template(system_template) + + # Get user template (diff-based or full rewrite) + if self.config.diff_based_evolution: + user_template = tm.get_template("diff_user") + else: + user_template = tm.get_template("full_rewrite_user") + + # Add as the default template + self.prompt_archive.add_template( + system_template=system_template, + user_template=user_template, + is_default=True, + metadata={"source": "default"}, + ) + logger.info("Added default prompt template to archive") + async def run( self, iterations: Optional[int] = None, @@ -333,6 +381,7 @@ async def run( self.database, self.evolution_tracer, file_suffix=self.config.file_suffix, + prompt_archive=self.prompt_archive, ) # Set up signal handlers for graceful shutdown @@ -493,6 +542,20 @@ def _save_checkpoint(self, iteration: int) -> None: f"{format_metrics_safe(best_program.metrics)}" ) + # Save prompt archive if meta-evolution is enabled + if self.prompt_archive is not None: + import json + + prompt_archive_path = os.path.join(checkpoint_path, "prompt_archive.json") + with open(prompt_archive_path, "w") as f: + json.dump(self.prompt_archive.to_dict(), f, indent=2) + stats = self.prompt_archive.get_statistics() + logger.info( + f"Saved prompt archive (size={stats['size']}, " + f"total_uses={stats['total_uses']}, " + f"success_rate={stats['overall_success_rate']:.1%})" + ) + logger.info(f"Saved checkpoint at iteration {iteration} to {checkpoint_path}") def _load_checkpoint(self, checkpoint_path: str) -> None: @@ -504,6 +567,102 @@ def _load_checkpoint(self, checkpoint_path: str) -> None: self.database.load(checkpoint_path) logger.info(f"Checkpoint loaded successfully (iteration {self.database.last_iteration})") + # Load prompt archive if meta-evolution is enabled + if self.prompt_archive is not None: + import json + + prompt_archive_path = os.path.join(checkpoint_path, "prompt_archive.json") + if os.path.exists(prompt_archive_path): + with open(prompt_archive_path, "r") as f: + self.prompt_archive = PromptArchive.from_dict(json.load(f)) + # Re-inject into sampler and parallel controller + self.prompt_sampler.set_prompt_archive(self.prompt_archive) + stats = self.prompt_archive.get_statistics() + logger.info( + f"Loaded prompt archive (size={stats['size']}, " + f"total_uses={stats['total_uses']})" + ) + + def _maybe_evolve_prompts(self, iteration: int) -> None: + """ + Periodically evolve prompt templates if meta-evolution is enabled. + + Args: + iteration: Current iteration number + """ + if self.prompt_archive is None: + return + + # Only evolve at configured intervals + interval = self.config.prompt_meta_evolution.evolution_interval + if iteration == 0 or iteration % interval != 0: + return + + # Get templates ready for evolution + templates_to_evolve = self.prompt_archive.get_templates_for_evolution() + if not templates_to_evolve: + logger.debug("No templates ready for evolution yet") + return + + top_templates = self.prompt_archive.get_top_templates(5) + + # Evolve the top template that's ready for evolution + # Sort by score descending + templates_to_evolve.sort(key=lambda t: t.score, reverse=True) + template = templates_to_evolve[0] + + logger.info( + f"Evolving prompt template {template.id} " + f"(score={template.score:.3f}, uses={template.uses})" + ) + + # Create a sync wrapper for LLM generation that works in async context + # We use a thread pool to avoid event loop conflicts + import concurrent.futures + + def llm_generate_sync(system: str, user: str) -> str: + import asyncio + + # Create a new event loop in a thread to avoid conflicts + def run_in_new_loop(): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + return loop.run_until_complete( + self.llm_ensemble.generate_with_context( + system_message=system, + messages=[{"role": "user", "content": user}], + ) + ) + finally: + loop.close() + + with concurrent.futures.ThreadPoolExecutor() as executor: + future = executor.submit(run_in_new_loop) + return future.result() + + # Evolve the template + result = evolve_prompt( + template, + top_templates, + llm_generate_sync, + score_fn=self.prompt_archive.get_template_score, + ) + if result: + new_system, new_user = result + new_template = self.prompt_archive.add_template( + system_template=new_system, + user_template=new_user, + parent_id=template.id, + metadata={"evolved_at_iteration": iteration}, + ) + logger.info( + f"Created evolved template {new_template.id} " + f"(generation {new_template.generation})" + ) + else: + logger.warning(f"Failed to evolve template {template.id}") + async def _run_evolution_with_checkpoints( self, start_iteration: int, max_iterations: int, target_score: Optional[float] ) -> None: @@ -511,9 +670,28 @@ async def _run_evolution_with_checkpoints( logger.info(f"Using island-based evolution with {self.config.database.num_islands} islands") self.database.log_island_status() - # Run the evolution process with checkpoint callback + # Track last prompt evolution for catching up between checkpoints + last_prompt_evolution = [start_iteration] # Use list for closure mutability + + # Create a combined callback that handles checkpoints and prompt evolution + def combined_callback(iteration: int) -> None: + self._save_checkpoint(iteration) + + # Trigger prompt evolution - catch up on any missed intervals + if self.prompt_archive is not None: + evolution_interval = self.config.prompt_meta_evolution.evolution_interval + # Find all evolution points between last_prompt_evolution and current iteration + next_evolution = ( + last_prompt_evolution[0] // evolution_interval + 1 + ) * evolution_interval + while next_evolution <= iteration: + self._maybe_evolve_prompts(next_evolution) + next_evolution += evolution_interval + last_prompt_evolution[0] = iteration + + # Run the evolution process with combined callback await self.parallel_controller.run_evolution( - start_iteration, max_iterations, target_score, checkpoint_callback=self._save_checkpoint + start_iteration, max_iterations, target_score, checkpoint_callback=combined_callback ) # Check if shutdown or early stopping was triggered diff --git a/openevolve/process_parallel.py b/openevolve/process_parallel.py index 64e945bb1..b2b5c4425 100644 --- a/openevolve/process_parallel.py +++ b/openevolve/process_parallel.py @@ -14,9 +14,14 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Tuple +from typing import TYPE_CHECKING + from openevolve.config import Config from openevolve.database import Program, ProgramDatabase -from openevolve.utils.metrics_utils import safe_numeric_average +from openevolve.utils.metrics_utils import get_fitness_score, safe_numeric_average + +if TYPE_CHECKING: + from openevolve.prompt.meta_evolution import PromptArchive logger = logging.getLogger(__name__) @@ -33,6 +38,7 @@ class SerializableResult: artifacts: Optional[Dict[str, Any]] = None iteration: int = 0 error: Optional[str] = None + template_id: Optional[str] = None # For prompt meta-evolution tracking def _worker_init(config_dict: dict, evaluation_file: str, parent_env: dict = None) -> None: @@ -131,9 +137,23 @@ def _lazy_init_worker_components(): def _run_iteration_worker( - iteration: int, db_snapshot: Dict[str, Any], parent_id: str, inspiration_ids: List[str] + iteration: int, + db_snapshot: Dict[str, Any], + parent_id: str, + inspiration_ids: List[str], + template_info: Optional[Dict[str, str]] = None, ) -> SerializableResult: - """Run a single iteration in a worker process""" + """Run a single iteration in a worker process + + Args: + iteration: The iteration number + db_snapshot: Snapshot of the database state + parent_id: ID of the parent program to evolve + inspiration_ids: IDs of programs to use as inspiration + template_info: Optional dict with 'template_id', 'system_template', 'user_template' + for prompt meta-evolution. If provided, uses these instead of + sampling from the worker's prompt sampler. + """ try: # Lazy initialization _lazy_init_worker_components() @@ -190,6 +210,7 @@ def _run_iteration_worker( program_artifacts=parent_artifacts, feature_dimensions=db_snapshot.get("feature_dimensions", []), current_changes_description=parent_changes_desc, + meta_template_info=template_info, # Pass pre-sampled template for meta-evolution ) iteration_start = time.time() @@ -312,6 +333,9 @@ def _run_iteration_worker( iteration_time = time.time() - iteration_start + # Extract template_id for meta-evolution tracking (if present) + template_id = prompt.get("template_id") if prompt else None + return SerializableResult( child_program_dict=child_program.to_dict(), parent_id=parent.id, @@ -320,6 +344,7 @@ def _run_iteration_worker( llm_response=llm_response, artifacts=artifacts, iteration=iteration, + template_id=template_id, ) except Exception as e: @@ -337,12 +362,14 @@ def __init__( database: ProgramDatabase, evolution_tracer=None, file_suffix: str = ".py", + prompt_archive: Optional["PromptArchive"] = None, ): self.config = config self.evaluation_file = evaluation_file self.database = database self.evolution_tracer = evolution_tracer self.file_suffix = file_suffix + self.prompt_archive = prompt_archive self.executor: Optional[ProcessPoolExecutor] = None self.shutdown_event = mp.Event() @@ -550,6 +577,13 @@ async def run_evolution( if result.error: logger.warning(f"Iteration {completed_iteration} error: {result.error}") + # Record failed outcome for prompt meta-evolution + if self.prompt_archive is not None and result.template_id: + self.prompt_archive.record_outcome( + result.template_id, + accepted=False, + fitness_delta=0.0, + ) elif result.child_program_dict: # Reconstruct program from dict child_program = Program(**result.child_program_dict) @@ -562,6 +596,22 @@ async def run_evolution( if result.artifacts: self.database.store_artifacts(child_program.id, result.artifacts) + # Record outcome for prompt meta-evolution + if self.prompt_archive is not None and result.template_id: + parent_program = ( + self.database.get(result.parent_id) if result.parent_id else None + ) + if parent_program: + feature_dims = self.config.database.feature_dimensions + child_fitness = get_fitness_score(child_program.metrics, feature_dims) + parent_fitness = get_fitness_score(parent_program.metrics, feature_dims) + fitness_delta = child_fitness - parent_fitness + self.prompt_archive.record_outcome( + result.template_id, + accepted=True, + fitness_delta=fitness_delta, + ) + # Log evolution trace if self.evolution_tracer: # Retrieve parent program for trace logging @@ -801,6 +851,21 @@ def _submit_iteration( db_snapshot = self._create_database_snapshot() db_snapshot["sampling_island"] = target_island # Mark which island this is for + # Sample template from archive if meta-evolution is enabled + # This must happen in the main process since workers don't have the archive + template_info = None + if self.prompt_archive is not None: + sampled_template = self.prompt_archive.sample_template() + template_info = { + "template_id": sampled_template.id, + "system_template": sampled_template.system_template, + "user_template": sampled_template.user_template, + } + logger.debug( + f"Iteration {iteration}: sampled template {sampled_template.id} " + f"(score={sampled_template.score:.3f})" + ) + # Submit to process pool future = self.executor.submit( _run_iteration_worker, @@ -808,6 +873,7 @@ def _submit_iteration( db_snapshot, parent.id, [insp.id for insp in inspirations], + template_info, ) return future diff --git a/openevolve/prompt/meta_evolution.py b/openevolve/prompt/meta_evolution.py new file mode 100644 index 000000000..ab422aa12 --- /dev/null +++ b/openevolve/prompt/meta_evolution.py @@ -0,0 +1,508 @@ +""" +Meta-evolution of prompt templates for OpenEvolve. + +Inspired by the Darwin Gödel Machine paper, this module enables OpenEvolve +to evolve its own prompts based on empirical success rates. +""" + +import logging +import random +import re +import uuid +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional, Tuple + +logger = logging.getLogger(__name__) + + +@dataclass +class PromptTemplate: + """An evolvable prompt template with success tracking.""" + + id: str + system_template: str + user_template: str + # Success tracking + uses: int = 0 + successes: int = 0 # Number of times mutation was accepted + improvements: int = 0 # Number of times mutation improved fitness + total_fitness_delta: float = 0.0 # Sum of fitness changes + # Lineage + parent_id: Optional[str] = None + generation: int = 0 + metadata: Dict[str, Any] = field(default_factory=dict) + + @property + def success_rate(self) -> float: + """Fraction of uses that resulted in accepted mutations.""" + return self.successes / self.uses if self.uses > 0 else 0.0 + + @property + def improvement_rate(self) -> float: + """Fraction of uses that resulted in fitness improvement.""" + return self.improvements / self.uses if self.uses > 0 else 0.0 + + @property + def avg_fitness_delta(self) -> float: + """Average fitness change per use.""" + return self.total_fitness_delta / self.uses if self.uses > 0 else 0.0 + + def compute_score( + self, + weight_success: float = 0.3, + weight_improvement: float = 0.4, + weight_fitness_delta: float = 0.3, + min_uses: int = 5, + neutral_prior: float = 0.5, + ) -> float: + """ + Compute score for template quality with configurable weights. + + Args: + weight_success: Weight for success rate (mutations accepted) + weight_improvement: Weight for improvement rate (fitness increased) + weight_fitness_delta: Weight for avg fitness delta magnitude + min_uses: Minimum uses before score is calculated + neutral_prior: Score returned when uses < min_uses + + Returns: + Combined score between 0 and 1 + """ + if self.uses < min_uses: + return neutral_prior + # Weighted combination + return ( + weight_success * self.success_rate + + weight_improvement * self.improvement_rate + + weight_fitness_delta * min(1.0, self.avg_fitness_delta + 0.5) + ) + + @property + def score(self) -> float: + """ + Combined score for template quality using default weights. + For configurable weights, use compute_score() method. + """ + return self.compute_score() + + def record_use( + self, + accepted: bool, + fitness_delta: float = 0.0, + ) -> None: + """Record the outcome of using this template.""" + self.uses += 1 + if accepted: + self.successes += 1 + if fitness_delta > 0: + self.improvements += 1 + self.total_fitness_delta += fitness_delta + + def to_dict(self) -> Dict[str, Any]: + """Serialize to dictionary.""" + return { + "id": self.id, + "system_template": self.system_template, + "user_template": self.user_template, + "uses": self.uses, + "successes": self.successes, + "improvements": self.improvements, + "total_fitness_delta": self.total_fitness_delta, + "parent_id": self.parent_id, + "generation": self.generation, + "metadata": self.metadata, + } + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "PromptTemplate": + """Deserialize from dictionary.""" + return cls( + id=data["id"], + system_template=data["system_template"], + user_template=data["user_template"], + uses=data.get("uses", 0), + successes=data.get("successes", 0), + improvements=data.get("improvements", 0), + total_fitness_delta=data.get("total_fitness_delta", 0.0), + parent_id=data.get("parent_id"), + generation=data.get("generation", 0), + metadata=data.get("metadata", {}), + ) + + +class PromptArchive: + """ + Archive of evolvable prompt templates. + + Maintains a population of templates, tracks their success rates, + and supports sampling and evolution. + """ + + def __init__( + self, + max_size: int = 20, + min_uses_for_evolution: int = 10, + elite_fraction: float = 0.3, + exploration_rate: float = 0.2, + # Scoring weights + score_weight_success: float = 0.3, + score_weight_improvement: float = 0.4, + score_weight_fitness_delta: float = 0.3, + score_min_uses: int = 5, + score_neutral_prior: float = 0.5, + ): + """ + Initialize the prompt archive. + + Args: + max_size: Maximum number of templates to keep + min_uses_for_evolution: Minimum uses before a template can be evolved + elite_fraction: Fraction of top templates to preserve + exploration_rate: Probability of sampling a random/new template + score_weight_success: Weight for success rate in scoring + score_weight_improvement: Weight for improvement rate in scoring + score_weight_fitness_delta: Weight for fitness delta in scoring + score_min_uses: Minimum uses before calculating score + score_neutral_prior: Score for templates with insufficient uses + """ + self.max_size = max_size + self.min_uses_for_evolution = min_uses_for_evolution + self.elite_fraction = elite_fraction + self.exploration_rate = exploration_rate + + # Scoring configuration + self.score_weight_success = score_weight_success + self.score_weight_improvement = score_weight_improvement + self.score_weight_fitness_delta = score_weight_fitness_delta + self.score_min_uses = score_min_uses + self.score_neutral_prior = score_neutral_prior + + self.templates: Dict[str, PromptTemplate] = {} + self.default_template_id: Optional[str] = None + + def get_template_score(self, template: PromptTemplate) -> float: + """Get the score for a template using configured weights.""" + return template.compute_score( + weight_success=self.score_weight_success, + weight_improvement=self.score_weight_improvement, + weight_fitness_delta=self.score_weight_fitness_delta, + min_uses=self.score_min_uses, + neutral_prior=self.score_neutral_prior, + ) + + def add_template( + self, + system_template: str, + user_template: str, + parent_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + is_default: bool = False, + ) -> PromptTemplate: + """Add a new template to the archive.""" + template_id = str(uuid.uuid4())[:8] + + # Determine generation + generation = 0 + if parent_id and parent_id in self.templates: + generation = self.templates[parent_id].generation + 1 + + template = PromptTemplate( + id=template_id, + system_template=system_template, + user_template=user_template, + parent_id=parent_id, + generation=generation, + metadata=metadata or {}, + ) + + self.templates[template_id] = template + + # Set as default if first template or explicitly requested + if self.default_template_id is None or is_default: + self.default_template_id = template_id + + # Prune if over capacity + self._prune_if_needed() + + logger.info( + f"Added prompt template {template_id} (generation {generation}, " + f"archive size: {len(self.templates)})" + ) + + return template + + def get_template(self, template_id: str) -> Optional[PromptTemplate]: + """Get a template by ID.""" + return self.templates.get(template_id) + + def sample_template(self) -> PromptTemplate: + """ + Sample a template for use. + + Uses a mix of exploitation (high-scoring templates) and + exploration (less-used or random templates). + """ + if not self.templates: + raise ValueError("No templates in archive") + + # Exploration: occasionally pick a random template + if random.random() < self.exploration_rate: + template = random.choice(list(self.templates.values())) + logger.debug(f"Sampled template {template.id} (exploration)") + return template + + # Exploitation: prefer high-scoring templates + # Weight by score, with bonus for less-used templates + templates = list(self.templates.values()) + weights = [] + for t in templates: + # Score-based weight with exploration bonus for under-used templates + exploration_bonus = max(0, 1.0 - t.uses / 20) * 0.3 + weights.append(self.get_template_score(t) + exploration_bonus) + + # Normalize weights + total = sum(weights) + if total == 0: + template = random.choice(templates) + else: + weights = [w / total for w in weights] + template = random.choices(templates, weights=weights, k=1)[0] + + logger.debug( + f"Sampled template {template.id} (score={self.get_template_score(template):.3f}, " + f"uses={template.uses})" + ) + return template + + def record_outcome( + self, + template_id: str, + accepted: bool, + fitness_delta: float = 0.0, + ) -> None: + """Record the outcome of using a template.""" + if template_id not in self.templates: + logger.warning(f"Template {template_id} not found in archive") + return + + self.templates[template_id].record_use(accepted, fitness_delta) + logger.debug( + f"Template {template_id}: accepted={accepted}, " + f"fitness_delta={fitness_delta:.4f}, " + f"new_score={self.get_template_score(self.templates[template_id]):.3f}" + ) + + def get_templates_for_evolution(self) -> List[PromptTemplate]: + """Get templates that are ready for evolution (enough uses).""" + return [t for t in self.templates.values() if t.uses >= self.min_uses_for_evolution] + + def get_top_templates(self, n: int = 5) -> List[PromptTemplate]: + """Get the top N templates by score.""" + sorted_templates = sorted( + self.templates.values(), + key=lambda t: self.get_template_score(t), + reverse=True, + ) + return sorted_templates[:n] + + def get_statistics(self) -> Dict[str, Any]: + """Get archive statistics.""" + if not self.templates: + return {"size": 0} + + templates = list(self.templates.values()) + total_uses = sum(t.uses for t in templates) + total_successes = sum(t.successes for t in templates) + + return { + "size": len(templates), + "total_uses": total_uses, + "total_successes": total_successes, + "overall_success_rate": (total_successes / total_uses if total_uses > 0 else 0), + "max_generation": max(t.generation for t in templates), + "avg_score": sum(self.get_template_score(t) for t in templates) / len(templates), + "top_template_id": self.get_top_templates(1)[0].id if templates else None, + } + + def _prune_if_needed(self) -> None: + """Remove lowest-scoring templates if over capacity.""" + if len(self.templates) <= self.max_size: + return + + # Keep elite templates + num_elite = max(1, int(self.max_size * self.elite_fraction)) + sorted_templates = sorted( + self.templates.values(), + key=lambda t: self.get_template_score(t), + reverse=True, + ) + + # Templates to keep: elite + default + elite_ids = {t.id for t in sorted_templates[:num_elite]} + + # Also keep default template + if self.default_template_id: + elite_ids.add(self.default_template_id) + + # Remove lowest scoring non-elite templates + to_remove = [] + for t in reversed(sorted_templates): + if t.id not in elite_ids and len(self.templates) - len(to_remove) > self.max_size: + to_remove.append(t.id) + + for tid in to_remove: + del self.templates[tid] + logger.debug(f"Pruned template {tid} from archive") + + def to_dict(self) -> Dict[str, Any]: + """Serialize archive to dictionary.""" + return { + "max_size": self.max_size, + "min_uses_for_evolution": self.min_uses_for_evolution, + "elite_fraction": self.elite_fraction, + "exploration_rate": self.exploration_rate, + "default_template_id": self.default_template_id, + "templates": {tid: t.to_dict() for tid, t in self.templates.items()}, + } + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "PromptArchive": + """Deserialize archive from dictionary.""" + archive = cls( + max_size=data.get("max_size", 20), + min_uses_for_evolution=data.get("min_uses_for_evolution", 10), + elite_fraction=data.get("elite_fraction", 0.3), + exploration_rate=data.get("exploration_rate", 0.2), + ) + archive.default_template_id = data.get("default_template_id") + + for tid, tdata in data.get("templates", {}).items(): + archive.templates[tid] = PromptTemplate.from_dict(tdata) + + return archive + + +# Prompt for evolving prompts (meta!) +PROMPT_EVOLUTION_SYSTEM = """You are an expert at crafting prompts for code evolution systems. +Your task is to improve prompts that guide an LLM to generate better code mutations. + +A good evolution prompt should: +1. Clearly explain the task and expected output format +2. Provide useful context without overwhelming detail +3. Encourage creative yet targeted improvements +4. Guide the LLM to explain its reasoning +""" + +PROMPT_EVOLUTION_USER = """# Current Prompt Performance + +The following prompt template has been used {uses} times: +- Success rate (mutations accepted): {success_rate:.1%} +- Improvement rate (fitness increased): {improvement_rate:.1%} +- Average fitness change: {avg_fitness_delta:+.4f} + +## Current System Template +``` +{system_template} +``` + +## Current User Template +``` +{user_template} +``` + +## Top Performing Templates for Reference + +{top_templates_section} + +# Task + +Create an improved version of this prompt that will lead to better mutation success rates. + +Focus on: +1. Clearer instructions for the type of changes to make +2. Better guidance on analyzing the current program +3. More effective use of the evolution history +4. Encouraging both exploitation (improving what works) and exploration (trying new approaches) + +Provide your improved templates in the following format: + + +Your improved system template here + + + +Your improved user template here + + +Explain your changes briefly after the templates. +""" + + +def evolve_prompt( + template: PromptTemplate, + top_templates: List[PromptTemplate], + llm_generate_fn: Callable[[str, str], str], + score_fn: Optional[Callable[[PromptTemplate], float]] = None, +) -> Optional[Tuple[str, str]]: + """ + Evolve a prompt template using an LLM. + + Args: + template: The template to evolve + top_templates: Top performing templates for reference + llm_generate_fn: Function to call LLM (takes system, user, returns str) + score_fn: Optional function to compute template scores (defaults to template.score) + + Returns: + Tuple of (new_system_template, new_user_template) or None if evolution failed + """ + # Use provided score function or fall back to default + get_score = score_fn if score_fn is not None else (lambda t: t.score) + + # Format top templates section + top_section = "" + for i, t in enumerate(top_templates[:3]): + if t.id == template.id: + continue + top_section += f"""### Template {i + 1} (score: {get_score(t):.3f}, success: {t.success_rate:.1%}) +System (truncated): {t.system_template[:200]}... +User (truncated): {t.user_template[:300]}... + +""" + + user_prompt = PROMPT_EVOLUTION_USER.format( + uses=template.uses, + success_rate=template.success_rate, + improvement_rate=template.improvement_rate, + avg_fitness_delta=template.avg_fitness_delta, + system_template=template.system_template, + user_template=template.user_template, + top_templates_section=top_section or "No other templates available yet.", + ) + + try: + response = llm_generate_fn(PROMPT_EVOLUTION_SYSTEM, user_prompt) + + # Parse response + new_system = _extract_between_tags(response, "system_template") + new_user = _extract_between_tags(response, "user_template") + + if new_system and new_user: + logger.info(f"Successfully evolved template {template.id}") + return new_system, new_user + else: + logger.warning("Failed to parse evolved template from response") + return None + + except Exception as e: + logger.error(f"Error evolving template: {e}") + return None + + +def _extract_between_tags(text: str, tag: str) -> Optional[str]: + """Extract content between XML-style tags.""" + pattern = rf"<{tag}>\s*(.*?)\s*" + match = re.search(pattern, text, re.DOTALL) + if match: + return match.group(1).strip() + return None diff --git a/openevolve/prompt/sampler.py b/openevolve/prompt/sampler.py index 61a5b98ba..6a75513d7 100644 --- a/openevolve/prompt/sampler.py +++ b/openevolve/prompt/sampler.py @@ -4,7 +4,7 @@ import logging import random -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union from openevolve.config import PromptConfig from openevolve.prompt.templates import TemplateManager @@ -15,13 +15,20 @@ format_feature_coordinates, ) +if TYPE_CHECKING: + from openevolve.prompt.meta_evolution import PromptArchive + logger = logging.getLogger(__name__) class PromptSampler: """Generates prompts for code evolution""" - def __init__(self, config: PromptConfig): + def __init__( + self, + config: PromptConfig, + prompt_archive: Optional["PromptArchive"] = None, + ): self.config = config self.template_manager = TemplateManager(custom_template_dir=config.template_dir) @@ -29,6 +36,10 @@ def __init__(self, config: PromptConfig): self.system_template_override = None self.user_template_override = None + # Meta-evolution: optional prompt archive for template sampling + self.prompt_archive = prompt_archive + self._last_sampled_template_id: Optional[str] = None + # Only log once to reduce duplication if not hasattr(logger, "_prompt_sampler_logged"): logger.info("Initialized prompt sampler") @@ -48,6 +59,21 @@ def set_templates( self.user_template_override = user_template logger.info(f"Set custom templates: system={system_template}, user={user_template}") + def set_prompt_archive(self, archive: Optional["PromptArchive"]) -> None: + """ + Set the prompt archive for meta-evolution. + + Args: + archive: PromptArchive instance or None to disable + """ + self.prompt_archive = archive + if archive is not None: + logger.info(f"Enabled prompt meta-evolution (archive size: {len(archive.templates)})") + + def get_last_template_id(self) -> Optional[str]: + """Get the ID of the last sampled template, or None if not using meta-evolution.""" + return self._last_sampled_template_id + def build_prompt( self, current_program: str = "", @@ -63,6 +89,7 @@ def build_prompt( program_artifacts: Optional[Dict[str, Union[str, bytes]]] = None, feature_dimensions: Optional[List[str]] = None, current_changes_description: Optional[str] = None, + meta_template_info: Optional[Dict[str, str]] = None, **kwargs: Any, ) -> Dict[str, str]: """ @@ -80,41 +107,73 @@ def build_prompt( diff_based_evolution: Whether to use diff-based evolution (True) or full rewrites (False) template_key: Optional override for template key program_artifacts: Optional artifacts from program evaluation + meta_template_info: Optional dict with 'template_id', 'system_template', 'user_template' + for prompt meta-evolution. If provided, uses these templates + instead of sampling from the archive. **kwargs: Additional keys to replace in the user prompt Returns: - Dictionary with 'system' and 'user' keys + Dictionary with 'system', 'user', and optionally 'template_id' keys """ - # Select template based on evolution mode (with overrides) - if template_key: - # Use explicitly provided template key - user_template_key = template_key - elif self.user_template_override: - # Use the override set with set_templates - user_template_key = self.user_template_override + # Reset template tracking + self._last_sampled_template_id = None + + # Priority 1: Use pre-provided meta-evolution template (from worker processes) + if meta_template_info is not None: + self._last_sampled_template_id = meta_template_info.get("template_id") + system_message = meta_template_info.get("system_template") + user_template = meta_template_info.get("user_template") + logger.debug( + f"Using pre-sampled meta-evolution template {self._last_sampled_template_id}" + ) + # Priority 2: Sample from prompt archive (main process with archive) + elif self.prompt_archive is not None: + sampled_template = self.prompt_archive.sample_template() + self._last_sampled_template_id = sampled_template.id + system_message = sampled_template.system_template + user_template = sampled_template.user_template + logger.debug( + f"Using meta-evolved template {sampled_template.id} " + f"(score={sampled_template.score:.3f})" + ) else: - # Default behavior: diff-based vs full rewrite - user_template_key = "diff_user" if diff_based_evolution else "full_rewrite_user" + # Standard template selection (no meta-evolution) + # Select template based on evolution mode (with overrides) + if template_key: + # Use explicitly provided template key + user_template_key = template_key + elif self.user_template_override: + # Use the override set with set_templates + user_template_key = self.user_template_override + else: + # Default behavior: diff-based vs full rewrite + user_template_key = "diff_user" if diff_based_evolution else "full_rewrite_user" - # Get the template - user_template = self.template_manager.get_template(user_template_key) + # Get the template + user_template = self.template_manager.get_template(user_template_key) - # Use system template override if set - if self.system_template_override: - system_message = self.template_manager.get_template(self.system_template_override) - else: - system_message = self.config.system_message - # If system_message is a template name rather than content, get the template - if system_message in self.template_manager.templates: - system_message = self.template_manager.get_template(system_message) + # Use system template override if set + if self.system_template_override: + system_message = self.template_manager.get_template(self.system_template_override) + else: + system_message = self.config.system_message + # If system_message is a template name rather than content, get the template + if system_message in self.template_manager.templates: + system_message = self.template_manager.get_template(system_message) if self.config.programs_as_changes_description: if self.config.system_message_changes_description: - system_message_changes_description = self.config.system_message_changes_description.strip() + system_message_changes_description = ( + self.config.system_message_changes_description.strip() + ) else: - system_message_changes_description = self.template_manager.get_template("system_message_changes_description") + system_message_changes_description = self.template_manager.get_template( + "system_message_changes_description" + ) - system_message = self.template_manager.get_template("system_message_with_changes_description").format( + system_message = self.template_manager.get_template( + "system_message_with_changes_description" + ).format( system_message=system_message, system_message_changes_description=system_message_changes_description, ) @@ -161,16 +220,24 @@ def build_prompt( ) if self.config.programs_as_changes_description: - user_message = self.template_manager.get_template("user_message_with_changes_description").format( + user_message = self.template_manager.get_template( + "user_message_with_changes_description" + ).format( user_message=user_message, changes_description=current_changes_description.rstrip(), ) - return { + result = { "system": system_message, "user": user_message, } + # Include template_id if meta-evolution is active + if self._last_sampled_template_id is not None: + result["template_id"] = self._last_sampled_template_id + + return result + def _format_metrics(self, metrics: Dict[str, float]) -> str: """Format metrics for the prompt using safe formatting""" # Use safe formatting to handle mixed numeric and string values @@ -265,11 +332,8 @@ def _format_evolution_history( for i, program in enumerate(reversed(selected_previous)): attempt_number = len(previous_programs) - i - changes = ( - program.get("changes_description") - or program.get("metadata", {}).get( - "changes", self.template_manager.get_fragment("attempt_unknown_changes") - ) + changes = program.get("changes_description") or program.get("metadata", {}).get( + "changes", self.template_manager.get_fragment("attempt_unknown_changes") ) # Format performance metrics using safe formatting @@ -334,9 +398,7 @@ def _format_evolution_history( for i, program in enumerate(selected_top): use_changes = self.config.programs_as_changes_description program_code = ( - program.get("changes_description", "") - if use_changes - else program.get("code", "") + program.get("changes_description", "") if use_changes else program.get("code", "") ) if not program_code: program_code = "" if use_changes else "" @@ -351,11 +413,20 @@ def _format_evolution_history( for name, value in program.get("metrics", {}).items(): if isinstance(value, (int, float)): try: - key_features.append(self.template_manager.get_fragment("top_program_metrics_prefix") + f" {name} ({value:.4f})") + key_features.append( + self.template_manager.get_fragment("top_program_metrics_prefix") + + f" {name} ({value:.4f})" + ) except (ValueError, TypeError): - key_features.append(self.template_manager.get_fragment("top_program_metrics_prefix") + f" {name} ({value})") + key_features.append( + self.template_manager.get_fragment("top_program_metrics_prefix") + + f" {name} ({value})" + ) else: - key_features.append(self.template_manager.get_fragment("top_program_metrics_prefix") + f" {name} ({value})") + key_features.append( + self.template_manager.get_fragment("top_program_metrics_prefix") + + f" {name} ({value})" + ) key_features_str = ", ".join(key_features) @@ -385,7 +456,11 @@ def _format_evolution_history( # Use random sampling to get diverse programs diverse_programs = random.sample(remaining_programs, num_diverse) - diverse_programs_str += "\n\n## " + self.template_manager.get_fragment("diverse_programs_title") + "\n\n" + diverse_programs_str += ( + "\n\n## " + + self.template_manager.get_fragment("diverse_programs_title") + + "\n\n" + ) for i, program in enumerate(diverse_programs): use_changes = self.config.programs_as_changes_description @@ -404,7 +479,8 @@ def _format_evolution_history( key_features = program.get("key_features", []) if not key_features: key_features = [ - self.template_manager.get_fragment("diverse_program_metrics_prefix") + f" {name}" + self.template_manager.get_fragment("diverse_program_metrics_prefix") + + f" {name}" for name in list(program.get("metrics", {}).keys())[ :2 ] # Just first 2 metrics @@ -416,7 +492,9 @@ def _format_evolution_history( top_program_template.format( program_number=f"D{i + 1}", score=f"{score:.4f}", - language=("text" if self.config.programs_as_changes_description else language), + language=( + "text" if self.config.programs_as_changes_description else language + ), program_snippet=program_code, key_features=key_features_str, ) @@ -466,9 +544,7 @@ def _format_inspirations_section( for i, program in enumerate(inspirations): use_changes = self.config.programs_as_changes_description program_code = ( - program.get("changes_description", "") - if use_changes - else program.get("code", "") + program.get("changes_description", "") if use_changes else program.get("code", "") ) if not program_code: program_code = "" if use_changes else "" @@ -551,16 +627,24 @@ def _extract_unique_features(self, program: Dict[str, Any]) -> str: and self.config.include_changes_under_chars and len(changes) < self.config.include_changes_under_chars ): - features.append(self.template_manager.get_fragment("inspiration_changes_prefix").format(changes=changes)) + features.append( + self.template_manager.get_fragment("inspiration_changes_prefix").format( + changes=changes + ) + ) # Analyze metrics for standout characteristics metrics = program.get("metrics", {}) for metric_name, value in metrics.items(): if isinstance(value, (int, float)): if value >= 0.9: - features.append(f"{self.template_manager.get_fragment('inspiration_metrics_excellent').format(metric_name=metric_name, value=value)}") + features.append( + f"{self.template_manager.get_fragment('inspiration_metrics_excellent').format(metric_name=metric_name, value=value)}" + ) elif value <= 0.3: - features.append(f"{self.template_manager.get_fragment('inspiration_metrics_alternative').format(metric_name=metric_name)}") + features.append( + f"{self.template_manager.get_fragment('inspiration_metrics_alternative').format(metric_name=metric_name)}" + ) # Code-based features (simple heuristics) code = program.get("code", "") @@ -571,22 +655,32 @@ def _extract_unique_features(self, program: Dict[str, Any]) -> str: if "numpy" in code_lower or "np." in code_lower: features.append(self.template_manager.get_fragment("inspiration_code_with_numpy")) if "for" in code_lower and "while" in code_lower: - features.append(self.template_manager.get_fragment("inspiration_code_with_mixed_iteration")) + features.append( + self.template_manager.get_fragment("inspiration_code_with_mixed_iteration") + ) if ( self.config.concise_implementation_max_lines and len(code.split("\n")) <= self.config.concise_implementation_max_lines ): - features.append(self.template_manager.get_fragment("inspiration_code_with_concise_line")) + features.append( + self.template_manager.get_fragment("inspiration_code_with_concise_line") + ) elif ( self.config.comprehensive_implementation_min_lines and len(code.split("\n")) >= self.config.comprehensive_implementation_min_lines ): - features.append(self.template_manager.get_fragment("inspiration_code_with_comprehensive_line")) + features.append( + self.template_manager.get_fragment("inspiration_code_with_comprehensive_line") + ) # Default if no specific features found if not features: program_type = self._determine_program_type(program) - features.append(self.template_manager.get_fragment("inspiration_no_features_postfix").format(program_type=program_type)) + features.append( + self.template_manager.get_fragment("inspiration_no_features_postfix").format( + program_type=program_type + ) + ) # Use num_top_programs as limit for features (similar to how we limit programs) feature_limit = self.config.num_top_programs @@ -629,7 +723,12 @@ def _render_artifacts(self, artifacts: Dict[str, Union[str, bytes]]) -> str: sections.append(f"### {key}\n```\n{content}\n```") if sections: - return "## " + self.template_manager.get_fragment("artifact_title") + "\n\n" + "\n\n".join(sections) + return ( + "## " + + self.template_manager.get_fragment("artifact_title") + + "\n\n" + + "\n\n".join(sections) + ) else: return "" diff --git a/tests/test_prompt_meta_evolution.py b/tests/test_prompt_meta_evolution.py new file mode 100644 index 000000000..21248874f --- /dev/null +++ b/tests/test_prompt_meta_evolution.py @@ -0,0 +1,382 @@ +""" +Tests for prompt meta-evolution in openevolve.prompt.meta_evolution +""" + +import unittest + +from openevolve.prompt.meta_evolution import ( + PromptTemplate, + PromptArchive, + evolve_prompt, + _extract_between_tags, +) + + +class TestPromptTemplate(unittest.TestCase): + """Tests for PromptTemplate dataclass""" + + def test_initial_score(self): + """Test that new templates have neutral score""" + template = PromptTemplate( + id="test1", + system_template="You are a helpful assistant.", + user_template="Improve this code: {code}", + ) + # With 0 uses, should return 0.5 (neutral prior) + self.assertEqual(template.score, 0.5) + + def test_score_calculation(self): + """Test score calculation with usage data""" + template = PromptTemplate( + id="test1", + system_template="System", + user_template="User", + uses=10, + successes=8, # 80% success rate + improvements=6, # 60% improvement rate + total_fitness_delta=0.5, # avg delta = 0.05 + ) + + # success_rate = 0.8 + # improvement_rate = 0.6 + # avg_fitness_delta = 0.05, normalized = min(1.0, 0.05 + 0.5) = 0.55 + # score = 0.3 * 0.8 + 0.4 * 0.6 + 0.3 * 0.55 = 0.24 + 0.24 + 0.165 = 0.645 + expected_score = 0.3 * 0.8 + 0.4 * 0.6 + 0.3 * 0.55 + self.assertAlmostEqual(template.score, expected_score, places=3) + + def test_record_use(self): + """Test recording usage outcomes""" + template = PromptTemplate( + id="test1", + system_template="System", + user_template="User", + ) + + # Record successful improvement + template.record_use(accepted=True, fitness_delta=0.1) + self.assertEqual(template.uses, 1) + self.assertEqual(template.successes, 1) + self.assertEqual(template.improvements, 1) + self.assertAlmostEqual(template.total_fitness_delta, 0.1) + + # Record accepted but no improvement + template.record_use(accepted=True, fitness_delta=-0.05) + self.assertEqual(template.uses, 2) + self.assertEqual(template.successes, 2) + self.assertEqual(template.improvements, 1) # No improvement + self.assertAlmostEqual(template.total_fitness_delta, 0.05) + + # Record rejection + template.record_use(accepted=False, fitness_delta=0.0) + self.assertEqual(template.uses, 3) + self.assertEqual(template.successes, 2) + self.assertEqual(template.improvements, 1) + + def test_serialization(self): + """Test to_dict and from_dict""" + template = PromptTemplate( + id="test1", + system_template="System message", + user_template="User message", + uses=5, + successes=3, + improvements=2, + total_fitness_delta=0.25, + parent_id="parent1", + generation=1, + metadata={"source": "test"}, + ) + + data = template.to_dict() + restored = PromptTemplate.from_dict(data) + + self.assertEqual(restored.id, template.id) + self.assertEqual(restored.system_template, template.system_template) + self.assertEqual(restored.user_template, template.user_template) + self.assertEqual(restored.uses, template.uses) + self.assertEqual(restored.successes, template.successes) + self.assertEqual(restored.improvements, template.improvements) + self.assertAlmostEqual(restored.total_fitness_delta, template.total_fitness_delta) + self.assertEqual(restored.parent_id, template.parent_id) + self.assertEqual(restored.generation, template.generation) + self.assertEqual(restored.metadata, template.metadata) + + +class TestPromptArchive(unittest.TestCase): + """Tests for PromptArchive""" + + def setUp(self): + """Set up test archive""" + self.archive = PromptArchive( + max_size=5, + min_uses_for_evolution=3, + elite_fraction=0.4, + exploration_rate=0.0, # Disable exploration for deterministic tests + ) + + def test_add_template(self): + """Test adding templates""" + template = self.archive.add_template( + system_template="System", + user_template="User", + ) + + self.assertIn(template.id, self.archive.templates) + self.assertEqual(self.archive.default_template_id, template.id) + self.assertEqual(len(self.archive.templates), 1) + + def test_add_child_template(self): + """Test adding child template with parent""" + parent = self.archive.add_template( + system_template="Parent system", + user_template="Parent user", + ) + child = self.archive.add_template( + system_template="Child system", + user_template="Child user", + parent_id=parent.id, + ) + + self.assertEqual(child.parent_id, parent.id) + self.assertEqual(child.generation, 1) + + def test_sample_template(self): + """Test template sampling""" + template = self.archive.add_template( + system_template="System", + user_template="User", + ) + + sampled = self.archive.sample_template() + self.assertEqual(sampled.id, template.id) + + def test_sample_prefers_higher_score(self): + """Test that sampling prefers higher-scoring templates""" + # Add low-scoring template + low = self.archive.add_template( + system_template="Low", + user_template="Low", + ) + low.uses = 10 + low.successes = 1 + low.improvements = 0 + + # Add high-scoring template + high = self.archive.add_template( + system_template="High", + user_template="High", + ) + high.uses = 10 + high.successes = 9 + high.improvements = 8 + high.total_fitness_delta = 1.0 + + # Sample multiple times and check distribution + high_count = 0 + for _ in range(100): + sampled = self.archive.sample_template() + if sampled.id == high.id: + high_count += 1 + + # High-scoring template should be sampled more often + self.assertGreater(high_count, 50) + + def test_record_outcome(self): + """Test recording outcomes""" + template = self.archive.add_template( + system_template="System", + user_template="User", + ) + + self.archive.record_outcome(template.id, accepted=True, fitness_delta=0.1) + + self.assertEqual(template.uses, 1) + self.assertEqual(template.successes, 1) + + def test_get_templates_for_evolution(self): + """Test getting templates ready for evolution""" + template1 = self.archive.add_template( + system_template="System1", + user_template="User1", + ) + template1.uses = 5 # Above min_uses_for_evolution (3) + + template2 = self.archive.add_template( + system_template="System2", + user_template="User2", + ) + template2.uses = 2 # Below threshold + + ready = self.archive.get_templates_for_evolution() + self.assertEqual(len(ready), 1) + self.assertEqual(ready[0].id, template1.id) + + def test_pruning(self): + """Test that archive prunes when over capacity""" + # Add 6 templates (max_size is 5) + for i in range(6): + t = self.archive.add_template( + system_template=f"System{i}", + user_template=f"User{i}", + ) + t.uses = 10 + t.successes = i # Different scores + + # Should have pruned to max_size + self.assertEqual(len(self.archive.templates), 5) + + def test_serialization(self): + """Test archive serialization""" + t1 = self.archive.add_template( + system_template="System1", + user_template="User1", + ) + t1.uses = 5 + t1.successes = 3 + + t2 = self.archive.add_template( + system_template="System2", + user_template="User2", + parent_id=t1.id, + ) + + data = self.archive.to_dict() + restored = PromptArchive.from_dict(data) + + self.assertEqual(len(restored.templates), 2) + self.assertEqual(restored.default_template_id, self.archive.default_template_id) + self.assertEqual(restored.templates[t1.id].uses, 5) + self.assertEqual(restored.templates[t2.id].parent_id, t1.id) + + def test_get_statistics(self): + """Test archive statistics""" + t1 = self.archive.add_template( + system_template="System1", + user_template="User1", + ) + t1.uses = 10 + t1.successes = 8 + + t2 = self.archive.add_template( + system_template="System2", + user_template="User2", + parent_id=t1.id, + ) + t2.uses = 5 + t2.successes = 2 + + stats = self.archive.get_statistics() + + self.assertEqual(stats["size"], 2) + self.assertEqual(stats["total_uses"], 15) + self.assertEqual(stats["total_successes"], 10) + self.assertAlmostEqual(stats["overall_success_rate"], 10 / 15) + self.assertEqual(stats["max_generation"], 1) + + +class TestExtractBetweenTags(unittest.TestCase): + """Tests for tag extraction helper""" + + def test_extract_simple(self): + """Test simple tag extraction""" + text = "content" + result = _extract_between_tags(text, "tag") + self.assertEqual(result, "content") + + def test_extract_with_whitespace(self): + """Test extraction with whitespace""" + text = " content with spaces " + result = _extract_between_tags(text, "tag") + self.assertEqual(result, "content with spaces") + + def test_extract_multiline(self): + """Test multiline extraction""" + text = """""" + result = _extract_between_tags(text, "template") + self.assertEqual(result, "line 1\nline 2") + + def test_extract_not_found(self): + """Test extraction when tag not found""" + text = "no tags here" + result = _extract_between_tags(text, "tag") + self.assertIsNone(result) + + +class TestEvolvePrompt(unittest.TestCase): + """Tests for evolve_prompt function""" + + def test_evolve_prompt_success(self): + """Test successful prompt evolution""" + template = PromptTemplate( + id="test1", + system_template="Old system", + user_template="Old user", + uses=10, + successes=5, + improvements=3, + total_fitness_delta=0.2, + ) + + # Mock LLM that returns valid evolved templates + def mock_llm(system: str, user: str) -> str: + return """ +Here's an improved version: + + +New improved system template + + + +New improved user template + + +I made these changes because... +""" + + result = evolve_prompt(template, [], mock_llm) + + self.assertIsNotNone(result) + new_system, new_user = result + self.assertEqual(new_system, "New improved system template") + self.assertEqual(new_user, "New improved user template") + + def test_evolve_prompt_failure(self): + """Test prompt evolution when LLM returns invalid format""" + template = PromptTemplate( + id="test1", + system_template="Old system", + user_template="Old user", + uses=10, + ) + + # Mock LLM that returns invalid format + def mock_llm(system: str, user: str) -> str: + return "This response doesn't have the expected tags" + + result = evolve_prompt(template, [], mock_llm) + + self.assertIsNone(result) + + def test_evolve_prompt_exception(self): + """Test prompt evolution when LLM raises exception""" + template = PromptTemplate( + id="test1", + system_template="Old system", + user_template="Old user", + uses=10, + ) + + # Mock LLM that raises exception + def mock_llm(system: str, user: str) -> str: + raise RuntimeError("LLM error") + + result = evolve_prompt(template, [], mock_llm) + + self.assertIsNone(result) + + +if __name__ == "__main__": + unittest.main() From 62a4a99dc6cf129398beb55a388d5b2fc729c9cf Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Wed, 4 Feb 2026 12:33:51 -0800 Subject: [PATCH 3/3] Fix code review issues for prompt meta-evolution - Simplify async event loop handling using asyncio.run() - Add scoring config persistence in checkpoint serialization - Document exploration bonus formula with clear comment - Add test for scoring config serialization round-trip Co-Authored-By: Claude Opus 4.5 --- openevolve/controller.py | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/openevolve/controller.py b/openevolve/controller.py index 006a109de..161556b4d 100644 --- a/openevolve/controller.py +++ b/openevolve/controller.py @@ -10,6 +10,7 @@ import time import uuid from pathlib import Path +from concurrent.futures import ThreadPoolExecutor from typing import Any, Dict, List, Optional, Union from openevolve.config import Config, load_config @@ -616,29 +617,22 @@ def _maybe_evolve_prompts(self, iteration: int) -> None: f"(score={template.score:.3f}, uses={template.uses})" ) - # Create a sync wrapper for LLM generation that works in async context - # We use a thread pool to avoid event loop conflicts - import concurrent.futures - + # Create a sync wrapper for LLM generation that works within an async context. + # We run the async LLM call in a separate thread with its own event loop + # to avoid conflicts with the main event loop. def llm_generate_sync(system: str, user: str) -> str: - import asyncio - - # Create a new event loop in a thread to avoid conflicts - def run_in_new_loop(): - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - try: - return loop.run_until_complete( - self.llm_ensemble.generate_with_context( - system_message=system, - messages=[{"role": "user", "content": user}], - ) + def _run_async_in_thread(): + # asyncio.run() creates a new event loop, runs the coroutine, + # and cleans up the loop automatically + return asyncio.run( + self.llm_ensemble.generate_with_context( + system_message=system, + messages=[{"role": "user", "content": user}], ) - finally: - loop.close() + ) - with concurrent.futures.ThreadPoolExecutor() as executor: - future = executor.submit(run_in_new_loop) + with ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit(_run_async_in_thread) return future.result() # Evolve the template