From 46925e1142087ee52dc47d61d8d64b35fa7617ce Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 23 Jan 2026 00:35:11 +0000
Subject: [PATCH 1/5] fix: use lynx to fetch actual website content for
 extension checks

Instead of passing a simple message as the diff payload, the scheduled
extension checker now uses lynx to fetch the actual content from both
the main conference website and the CFP link page. This provides Claude
with real website content to analyze for deadline extensions rather than
just receiving an internal trigger message.
---
 .../workflows/scheduled-extension-checker.yml | 41 +++++++++++++++++--
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/scheduled-extension-checker.yml b/.github/workflows/scheduled-extension-checker.yml
index 77895f2b39..c92e0fc782 100644
--- a/.github/workflows/scheduled-extension-checker.yml
+++ b/.github/workflows/scheduled-extension-checker.yml
@@ -100,9 +100,11 @@ jobs:
                       print(f"Skip: {name} - already extended, past deadline", file=sys.stderr)
                       continue
 
+                  cfp_link = str(conf.get("cfp_link", ""))
                   closing_soon.append({
                       "conference": name,
                       "url": url,
+                      "cfp_link": cfp_link,
                       "cfp_date": str(cfp_date),
                       "has_extension": has_extension,
                       "check_type": check_type,
@@ -151,11 +153,15 @@ jobs:
       fail-fast: false
 
     steps:
-      - name: Trigger extension check
+      - name: Install lynx
+        run: sudo apt-get update && sudo apt-get install -y lynx
+
+      - name: Fetch website content and trigger check
         env:
           # Pass matrix values via env for safety
           CONF_NAME: ${{ matrix.conference.conference }}
           CONF_URL: ${{ matrix.conference.url }}
+          CONF_CFP_LINK: ${{ matrix.conference.cfp_link }}
           CONF_REASON: ${{ matrix.conference.reason }}
           CONF_CFP_DATE: ${{ matrix.conference.cfp_date }}
           CONF_CHECK_TYPE: ${{ matrix.conference.check_type }}
@@ -163,11 +169,38 @@ jobs:
           GH_TOKEN: ${{ github.token }}
         run: |
           echo "Triggering check for: $CONF_NAME"
-          echo "URL: $CONF_URL"
+          echo "Main URL: $CONF_URL"
+          echo "CFP Link: $CONF_CFP_LINK"
           echo "Reason: $CONF_REASON"
           echo "CFP Date: $CONF_CFP_DATE"
           echo "Check Type: $CONF_CHECK_TYPE"
 
+          # Fetch main website content
+          echo "Fetching main website content..."
+          MAIN_CONTENT=""
+          if [ -n "$CONF_URL" ]; then
+            MAIN_CONTENT=$(lynx -dump -nolist "$CONF_URL" 2>/dev/null | head -500 || echo "Failed to fetch main website")
+          fi
+
+          # Fetch CFP link content
+          echo "Fetching CFP link content..."
+          CFP_CONTENT=""
+          if [ -n "$CONF_CFP_LINK" ] && [ "$CONF_CFP_LINK" != "$CONF_URL" ]; then
+            CFP_CONTENT=$(lynx -dump -nolist "$CONF_CFP_LINK" 2>/dev/null | head -500 || echo "Failed to fetch CFP page")
+          fi
+
+          # Combine content for analysis
+          COMBINED_CONTENT="=== EXTENSION CHECK: $CONF_REASON ===
+
+=== MAIN WEBSITE ($CONF_URL) ===
+$MAIN_CONTENT
+
+=== CFP PAGE ($CONF_CFP_LINK) ===
+$CFP_CONTENT"
+
+          # Truncate if too long (GitHub has payload limits)
+          COMBINED_CONTENT=$(echo "$COMBINED_CONTENT" | head -c 60000)
+
           # Use gh CLI for safe API call
           # Pass trigger context so triage workflow can adjust prompt accordingly
           gh api repos/${{ github.repository }}/dispatches \
@@ -175,13 +208,13 @@ jobs:
             -f "client_payload[url]=$CONF_URL" \
             -f "client_payload[title]=$CONF_NAME" \
             -f "client_payload[watch_uuid]=" \
-            -f "client_payload[diff]=Scheduled extension check: $CONF_REASON" \
+            -f "client_payload[diff]=$COMBINED_CONTENT" \
             -f "client_payload[source]=scheduled-checker" \
             -f "client_payload[trigger_reason]=$CONF_CHECK_TYPE" \
             -f "client_payload[original_cfp_deadline]=$CONF_CFP_DATE" \
             -f "client_payload[has_extension]=$CONF_HAS_EXT"
 
-          echo "✓ Triggered"
+          echo "✓ Triggered with website content"
 
       - name: Rate limit pause
         run: sleep 10  # Generous pause between triggers

From fdb07f3151725a00a9a146c71d59b151a02aff0c Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 23 Jan 2026 00:38:39 +0000
Subject: [PATCH 2/5] fix: improve CFP link validation to exclude non-main CFP
 pages

The content_contains_cfp_info() function now properly distinguishes
between main CFP pages (talks/papers/proposals) and other "Call for X"
pages like sponsors, volunteers, reviewers, specialist tracks, etc.

Added:
- Negative indicators for non-CFP pages (sponsors, volunteers, etc.)
- Positive indicators for main CFP pages (call for papers, submit a talk, etc.)
- Logic to reject pages that contain at least one non-CFP indicator but
  no main CFP indicators

This prevents cfp_link from being incorrectly populated with links to
sponsor pages, volunteer sign-ups, or sub-track CFPs.
---
 utils/enrich_tba.py | 72 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 70 insertions(+), 2 deletions(-)

diff --git a/utils/enrich_tba.py b/utils/enrich_tba.py
index 42f14b645d..2550724a58 100644
--- a/utils/enrich_tba.py
+++ b/utils/enrich_tba.py
@@ -401,7 +401,69 @@ def content_contains_cfp_info(content: str) -> bool:
     """
     content_lower = content.lower()
 
-    # Must contain deadline-related keywords
+    # Negative indicators - if these dominate the page, it's NOT a main CFP
+    # These are "Call for X" pages that aren't about talk/paper submissions
+    non_cfp_indicators = [
+        "call for sponsors",
+        "call for volunteers",
+        "call for reviewers",
+        "call for mentors",
+        "call for organizers",
+        "call for committee",
+        "become a sponsor",
+        "sponsorship opportunities",
+        "sponsorship package",
+        "volunteer sign",
+        "volunteer registration",
+        "reviewer application",
+        "program committee",
+        "organizing committee",
+        "specialist track",  # Sub-track CFPs, not main CFP
+        "special interest",
+        "birds of a feather",
+        "bof session",
+    ]
+
+    # Count non-CFP indicators
+    non_cfp_count = sum(1 for indicator in non_cfp_indicators if indicator in content_lower)
+
+    # Positive indicators - these suggest it's about main talk/paper submissions
+    main_cfp_indicators = [
+        "call for papers",
+        "call for proposals",
+        "call for presentations",
+        "call for talks",
+        "call for speakers",
+        "submit a talk",
+        "submit a paper",
+        "submit a proposal",
+        "submit your talk",
+        "submit your paper",
+        "submit your proposal",
+        "submission deadline",
+        "proposal deadline",
+        "paper deadline",
+        "talk deadline",
+        "abstract deadline",
+        "cfp deadline",
+        "speaker submission",
+        "talk submission",
+        "paper submission",
+        "proposal submission",
+    ]
+
+    # Count main CFP indicators
+    main_cfp_count = sum(1 for indicator in main_cfp_indicators if indicator in content_lower)
+
+    # If non-CFP indicators dominate and no strong main CFP indicators, reject
+    if non_cfp_count > 0 and main_cfp_count == 0:
+        return False
+
+    # If strong main CFP indicators are present, accept even if some non-CFP indicators exist
+    if main_cfp_count >= 2:
+        return True
+
+    # For borderline cases, require deadline-related keywords
     deadline_keywords = [
         "deadline",
         "submit",
@@ -435,7 +497,13 @@ def content_contains_cfp_info(content: str) -> bool:
         r"\b202[4-9]\b",
     ]
 
-    return any(re.search(pattern, content_lower) for pattern in date_patterns)
+    has_date = any(re.search(pattern, content_lower) for pattern in date_patterns)
+    if not has_date:
+        return False
+
+    # Final check: if we have at least one main CFP indicator, accept
+    # Otherwise, only accept if no non-CFP indicators were found
+    return main_cfp_count >= 1 or non_cfp_count == 0
 
 
 def run_deterministic_extraction(conferences: list[dict[str, Any]]) -> EnrichmentResult:

From b8f120c1b8b09a328d2278a2191cb87e29d536a8 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 23 Jan 2026 00:42:36 +0000
Subject: [PATCH 3/5] refactor: move cfp_link validation to Claude API in full
 mode

Instead of using brittle rule-based validation for CFP links in
deterministic mode, cfp_link extraction is now handled by Claude
in full mode where it can properly understand context.

Changes:
- Remove content_contains_cfp_info() function (rule-based validation)
- Remove cfp_link extraction from run_deterministic_extraction()
- Add cfp_link to fields extracted by Claude in build_enrichment_prompt()
- Update prompt to instruct Claude to validate cfp_link is main CFP page

This ensures cfp_link is only added when AI can confirm it's the main
CFP submission page, not sponsors/volunteers/specialist tracks/etc.
---
 utils/enrich_tba.py | 162 ++++----------------------------------------
 1 file changed, 14 insertions(+), 148 deletions(-)

diff --git a/utils/enrich_tba.py b/utils/enrich_tba.py
index 2550724a58..2d2d2a0145 100644
--- a/utils/enrich_tba.py
+++ b/utils/enrich_tba.py
@@ -382,135 +382,11 @@ def extract_links_from_url(url: str) -> dict[str, str]:
     return found
 
 
-def content_contains_cfp_info(content: str) -> bool:
-    """Check if content contains main CFP (Call for Papers/Proposals) information.
-
-    This validates that a potential CFP link leads to a page about submitting
-    talks/papers/proposals, NOT other "Call for X" pages like sponsors, volunteers,
-    specialist tracks, etc.
-
-    Parameters
-    ----------
-    content : str
-        Page content to check
-
-    Returns
-    -------
-    bool
-        True if content appears to be about main CFP submissions
-    """
-    content_lower = content.lower()
-
-    # Negative indicators - if these dominate the page, it's NOT a main CFP
-    # These are "Call for X" pages that aren't about talk/paper submissions
-    non_cfp_indicators = [
-        "call for sponsors",
-        "call for volunteers",
-        "call for reviewers",
-        "call for mentors",
-        "call for organizers",
-        "call for committee",
-        "become a sponsor",
-        "sponsorship opportunities",
-        "sponsorship package",
-        "volunteer sign",
-        "volunteer registration",
-        "reviewer application",
-        "program committee",
-        "organizing committee",
-        "specialist track",  # Sub-track CFPs, not main CFP
-        "special interest",
-        "birds of a feather",
-        "bof session",
-    ]
-
-    # Count non-CFP indicators
-    non_cfp_count = sum(1 for indicator in non_cfp_indicators if indicator in content_lower)
-
-    # Positive indicators - these suggest it's about main talk/paper submissions
-    main_cfp_indicators = [
-        "call for papers",
-        "call for proposals",
-        "call for presentations",
-        "call for talks",
-        "call for speakers",
-        "submit a talk",
-        "submit a paper",
-        "submit a proposal",
-        "submit your talk",
-        "submit your paper",
-        "submit your proposal",
-        "submission deadline",
-        "proposal deadline",
-        "paper deadline",
-        "talk deadline",
-        "abstract deadline",
-        "cfp deadline",
-        "speaker submission",
-        "talk submission",
-        "paper submission",
-        "proposal submission",
-    ]
-
-    # Count main CFP indicators
-    main_cfp_count = sum(1 for indicator in main_cfp_indicators if indicator in content_lower)
-
-    # If non-CFP indicators dominate and no strong main CFP indicators, reject
-    if non_cfp_count > 0 and main_cfp_count == 0:
-        return False
-
-    # If strong main CFP indicators are present, accept even if some non-CFP indicators exist
-    if main_cfp_count >= 2:
-        return True
-
-    # For borderline cases, require deadline-related keywords
-    deadline_keywords = [
-        "deadline",
-        "submit",
-        "submission",
-        "due",
-        "closes",
-        "close",
-        "call for",
-        "cfp",
-        "proposal",
-        "abstract",
-    ]
-
-    has_deadline_keyword = any(kw in content_lower for kw in deadline_keywords)
-    if not has_deadline_keyword:
-        return False
-
-    # Must contain date-like patterns
-    date_patterns = [
-        # Month names (English)
-        r"\b(?:january|february|march|april|may|june|july|august|september|october|november|december)\b",
-        # Month abbreviations
-        r"\b(?:jan|feb|mar|apr|jun|jul|aug|sep|oct|nov|dec)[.\s]",
-        # ISO-like dates: YYYY-MM-DD or DD-MM-YYYY
-        r"\b\d{4}[-/]\d{1,2}[-/]\d{1,2}\b",
-        r"\b\d{1,2}[-/]\d{1,2}[-/]\d{4}\b",
-        # Day Month Year: "15 January 2026" or "January 15, 2026"
-        r"\b\d{1,2}\s+(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{4}\b",
-        r"\b(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},?\s+\d{4}\b",
-        # Year patterns near deadline words (e.g., "2026" near "deadline")
-        r"\b202[4-9]\b",
-    ]
-
-    has_date = any(re.search(pattern, content_lower) for pattern in date_patterns)
-    if not has_date:
-        return False
-
-    # Final check: if we have at least one main CFP indicator, accept
-    # Otherwise, only accept if no non-CFP indicators were found
-    return main_cfp_count >= 1 or non_cfp_count == 0
-
-
 def run_deterministic_extraction(conferences: list[dict[str, Any]]) -> EnrichmentResult:
     """Run deterministic link extraction on all conferences.
 
-    This extracts social media links, sponsor/finaid pages, and validated CFP links
-    without using AI - purely pattern matching and content validation.
+    This extracts social media links and sponsor/finaid pages without using AI -
+    purely pattern matching. CFP links are validated by Claude in full mode.
 
     Parameters
     ----------
@@ -536,24 +412,7 @@ def run_deterministic_extraction(conferences: list[dict[str, Any]]) -> Enrichmen
         logger.debug(f"Deterministic extraction for {name} {year}")
         extracted = extract_links_from_url(url)
 
-        # Also try to find and validate CFP link
-        # (can be external domain like Pretalx, SessionizeApp, etc.)
-        if "cfp_link" not in extracted:
-            cfp_links = find_cfp_links(url)
-            for cfp_url in cfp_links:
-                # Skip if same as main URL
-                if cfp_url == url:
-                    continue
-
-                logger.debug(f"  Validating CFP link: {cfp_url}")
-                cfp_content = prefetch_website(cfp_url)
-
-                if cfp_content and not cfp_content.startswith("Error"):
-                    if content_contains_cfp_info(cfp_content):
-                        extracted["cfp_link"] = cfp_url
-                        logger.debug(f"  Validated cfp_link: {cfp_url}")
-                        break
-                    logger.debug(f"  Skipped (no CFP info): {cfp_url}")
+        # Note: cfp_link is validated by Claude in full mode, not here
 
         if extracted:
             # Create ConferenceUpdate with deterministic fields
@@ -829,10 +688,11 @@ def build_enrichment_prompt(
     conferences: list[dict[str, Any]],
     content_map: dict[str, str],
 ) -> str:
-    """Build the Claude API prompt for date/timezone extraction.
+    """Build the Claude API prompt for CFP data extraction.
 
-    Note: URL fields (bluesky, mastodon, sponsor, finaid, cfp_link) are handled
-    deterministically and don't need AI extraction.
+    Note: URL fields (bluesky, mastodon, sponsor, finaid) are handled
+    deterministically. cfp_link requires AI validation to ensure it's
+    the main CFP page and not sponsors/volunteers/etc.
 
     Parameters
     ----------
@@ -848,6 +708,7 @@ def build_enrichment_prompt(
     """
     fields_to_extract = [
         "cfp",
+        "cfp_link",
         "workshop_deadline",
         "tutorial_deadline",
         "timezone",
@@ -855,6 +716,10 @@ def build_enrichment_prompt(
     field_instructions = """
 Extract the following fields if found:
 - cfp: MAIN CFP deadline for talks/papers/proposals (MUST be format 'YYYY-MM-DD HH:mm:ss', use 23:59:00 if no time)
+- cfp_link: URL to the MAIN CFP submission page (where speakers submit talks/papers/proposals)
+  - MUST be the main CFP page, NOT: sponsors, volunteers, specialist tracks, reviewers, financial aid
+  - Can be external (Pretalx, Sessionize, etc.) or on the conference website
+  - Only include if you're confident it's the main speaker/paper submission page
 - workshop_deadline: Workshop submission deadline (MUST be format 'YYYY-MM-DD HH:mm:ss')
 - tutorial_deadline: Tutorial submission deadline (MUST be format 'YYYY-MM-DD HH:mm:ss')
 - timezone: Conference timezone (MUST be IANA format with slash, e.g., 'America/Chicago', 'Europe/Berlin')
@@ -862,9 +727,10 @@ def build_enrichment_prompt(
 
 CRITICAL RULES:
 - cfp MUST be the MAIN Call for Papers/Proposals deadline (talks, papers, presentations)
-- cfp MUST NOT be: sponsors, volunteers, specialist tracks, financial aid, grants, reviewers
+- cfp and cfp_link MUST NOT be: sponsors, volunteers, specialist tracks, financial aid, grants, reviewers
 - If only "Call for Sponsors/Volunteers/Tracks" found, set status to "not_announced"
 - Date fields: MUST be exactly 'YYYY-MM-DD HH:mm:ss' format
+- URL fields: MUST be absolute URLs starting with https:// or http://
 - Timezone: MUST be IANA format with slash (America/New_York), NEVER abbreviations (EST, CEST)
 - Leave field EMPTY if not found on the page
 

From 5c4213cdfd6e528c835c111d29e3fda9a6bc315c Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 23 Jan 2026 00:47:30 +0000
Subject: [PATCH 4/5] fix: validate cfp_link implicitly when Claude finds CFP
 deadline

Instead of asking Claude to extract cfp_link URLs (which it can't
reliably do from content), we now:

1. Find potential cfp_link URLs via pattern matching (deterministic)
2. Fetch content from those URLs
3. Send combined content to Claude for CFP deadline extraction
4. If Claude finds a valid CFP deadline, the cfp_link is validated

This means cfp_link is only added when:
- We found a potential CFP URL via pattern matching
- We successfully fetched its content
- Claude found a main CFP deadline in the combined content (main page
  plus the fetched CFP pages)

The cfp_link_map tracks which CFP URL was fetched for each conference so
we can add it as cfp_link after Claude validates that the content
contains CFP info.
---
 utils/enrich_tba.py | 50 +++++++++++++++++++++++++++++++--------------
 1 file changed, 35 insertions(+), 15 deletions(-)

diff --git a/utils/enrich_tba.py b/utils/enrich_tba.py
index 2d2d2a0145..5c0ec87841 100644
--- a/utils/enrich_tba.py
+++ b/utils/enrich_tba.py
@@ -621,10 +621,13 @@ def find_cfp_links(url: str) -> list[str]:
     return cfp_links[:3]  # Limit to 3 CFP pages max
 
 
-def prefetch_websites(conferences: list[dict[str, Any]]) -> dict[str, str]:
+def prefetch_websites(
+    conferences: list[dict[str, Any]],
+) -> tuple[dict[str, str], dict[str, str]]:
     """Pre-fetch website content for multiple conferences.
 
-    Also fetches CFP-related subpages if found on the main page.
+    Also fetches CFP-related subpages if found on the main page, and tracks
+    which cfp_link was used for each conference (for validation).
 
     Parameters
     ----------
@@ -633,11 +636,14 @@ def prefetch_websites(conferences: list[dict[str, Any]]) -> dict[str, str]:
 
     Returns
     -------
-    dict[str, str]
-        Dict mapping conference key (name_year) to website content
+    tuple[dict[str, str], dict[str, str]]
+        Tuple of (content_map, cfp_link_map):
+        - content_map: Dict mapping conference key (name_year) to website content
+        - cfp_link_map: Dict mapping conference key to the cfp_link URL that was fetched
     """
     logger = get_logger()
     content_map: dict[str, str] = {}
+    cfp_link_map: dict[str, str] = {}
 
     for conf in conferences:
         name = conf.get("conference", "Unknown")
@@ -670,18 +676,28 @@ def prefetch_websites(conferences: list[dict[str, Any]]) -> dict[str, str]:
             if link not in cfp_links_to_fetch and link != url:
                 cfp_links_to_fetch.append(link)
 
+        # Track the first valid cfp_link we fetch (for later validation)
+        fetched_cfp_link = None
+
         # Fetch CFP subpages (limit to 2 to avoid too much content)
         for cfp_url in cfp_links_to_fetch[:2]:
             logger.debug(f"  Also fetching CFP page: {cfp_url}")
             cfp_content = prefetch_website(cfp_url)
             if cfp_content and not cfp_content.startswith("Error"):
                 additional_content.append(f"\n\n--- CFP Page ({cfp_url}) ---\n{cfp_content}")
+                # Track the first successfully fetched cfp_link
+                if fetched_cfp_link is None:
+                    fetched_cfp_link = cfp_url
+
+        # Store the cfp_link that was fetched (if any)
+        if fetched_cfp_link:
+            cfp_link_map[key] = fetched_cfp_link
 
         # Combine all content
         combined = main_content + "".join(additional_content)
         content_map[key] = combined[:MAX_CONTENT_LENGTH]
 
-    return content_map
+    return content_map, cfp_link_map
 
 
 def build_enrichment_prompt(
@@ -691,8 +707,8 @@ def build_enrichment_prompt(
     """Build the Claude API prompt for CFP data extraction.
 
     Note: URL fields (bluesky, mastodon, sponsor, finaid) are handled
-    deterministically. cfp_link requires AI validation to ensure it's
-    the main CFP page and not sponsors/volunteers/etc.
+    deterministically. cfp_link is validated implicitly - if Claude finds
+    a CFP deadline in content that includes cfp_link content, that link is valid.
 
     Parameters
     ----------
@@ -708,7 +724,6 @@ def build_enrichment_prompt(
     """
     fields_to_extract = [
         "cfp",
-        "cfp_link",
         "workshop_deadline",
         "tutorial_deadline",
         "timezone",
@@ -716,10 +731,6 @@ def build_enrichment_prompt(
     field_instructions = """
 Extract the following fields if found:
 - cfp: MAIN CFP deadline for talks/papers/proposals (MUST be format 'YYYY-MM-DD HH:mm:ss', use 23:59:00 if no time)
-- cfp_link: URL to the MAIN CFP submission page (where speakers submit talks/papers/proposals)
-  - MUST be the main CFP page, NOT: sponsors, volunteers, specialist tracks, reviewers, financial aid
-  - Can be external (Pretalx, Sessionize, etc.) or on the conference website
-  - Only include if you're confident it's the main speaker/paper submission page
 - workshop_deadline: Workshop submission deadline (MUST be format 'YYYY-MM-DD HH:mm:ss')
 - tutorial_deadline: Tutorial submission deadline (MUST be format 'YYYY-MM-DD HH:mm:ss')
 - timezone: Conference timezone (MUST be IANA format with slash, e.g., 'America/Chicago', 'Europe/Berlin')
@@ -727,10 +738,9 @@ def build_enrichment_prompt(
 
 CRITICAL RULES:
 - cfp MUST be the MAIN Call for Papers/Proposals deadline (talks, papers, presentations)
-- cfp and cfp_link MUST NOT be: sponsors, volunteers, specialist tracks, financial aid, grants, reviewers
+- cfp MUST NOT be: sponsors, volunteers, specialist tracks, financial aid, grants, reviewers
 - If only "Call for Sponsors/Volunteers/Tracks" found, set status to "not_announced"
 - Date fields: MUST be exactly 'YYYY-MM-DD HH:mm:ss' format
-- URL fields: MUST be absolute URLs starting with https:// or http://
 - Timezone: MUST be IANA format with slash (America/New_York), NEVER abbreviations (EST, CEST)
 - Leave field EMPTY if not found on the page
 
@@ -1156,7 +1166,7 @@ def enrich_tba_conferences(
 
         # Pre-fetch websites for AI processing
         logger.info("Pre-fetching conference websites...")
-        content_map = prefetch_websites(tba_conferences)
+        content_map, cfp_link_map = prefetch_websites(tba_conferences)
 
         # Build and send prompt to Claude for date/timezone extraction
         logger.info("Calling Claude API for date/timezone extraction...")
@@ -1185,6 +1195,16 @@ def enrich_tba_conferences(
             f"{ai_result.summary.get('not_announced', 0)} not announced",
         )
 
+        # Add cfp_link to conferences where Claude found a CFP deadline
+        # This validates the cfp_link - if Claude found CFP info in the content
+        # that includes the cfp_link page, that link is valid
+        for conf_update in ai_result.conferences:
+            key = f"{conf_update.conference}_{conf_update.year}"
+            if "cfp" in conf_update.fields and key in cfp_link_map:
+                cfp_link_url = cfp_link_map[key]
+                conf_update.fields["cfp_link"] = FieldUpdate(value=cfp_link_url, confidence=1.0)
+                logger.info(f"  {key}: validated cfp_link {cfp_link_url}")
+
         # Merge deterministic and AI results
         logger.info("Merging deterministic and AI results...")
         result = merge_enrichment_results(deterministic_result, ai_result)

From 3fc106dc0e308d48d9fab5b24fcf8c864760b6e8 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 23 Jan 2026 00:52:16 +0000
Subject: [PATCH 5/5] fix: use printf to build multi-line string in workflow

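The multi-line string's continuation lines (the === headers and variable
expansions) started at column 0, which terminated the `run: |` block
scalar and broke YAML parsing. Building the string with printf keeps every
line indented inside the block while producing the same output.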
---
 .github/workflows/scheduled-extension-checker.yml | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/scheduled-extension-checker.yml b/.github/workflows/scheduled-extension-checker.yml
index c92e0fc782..2f42db9cb3 100644
--- a/.github/workflows/scheduled-extension-checker.yml
+++ b/.github/workflows/scheduled-extension-checker.yml
@@ -189,14 +189,13 @@ jobs:
             CFP_CONTENT=$(lynx -dump -nolist "$CONF_CFP_LINK" 2>/dev/null | head -500 || echo "Failed to fetch CFP page")
           fi
 
-          # Combine content for analysis
-          COMBINED_CONTENT="=== EXTENSION CHECK: $CONF_REASON ===
-
-=== MAIN WEBSITE ($CONF_URL) ===
-$MAIN_CONTENT
-
-=== CFP PAGE ($CONF_CFP_LINK) ===
-$CFP_CONTENT"
+          # Combine content for analysis (use printf to avoid YAML parsing issues)
+          COMBINED_CONTENT=$(printf '%s\n\n%s\n%s\n\n%s\n%s' \
+            "=== EXTENSION CHECK: $CONF_REASON ===" \
+            "=== MAIN WEBSITE ($CONF_URL) ===" \
+            "$MAIN_CONTENT" \
+            "=== CFP PAGE ($CONF_CFP_LINK) ===" \
+            "$CFP_CONTENT")
 
           # Truncate if too long (GitHub has payload limits)
           COMBINED_CONTENT=$(echo "$COMBINED_CONTENT" | head -c 60000)