diff --git a/.github/workflows/scheduled-extension-checker.yml b/.github/workflows/scheduled-extension-checker.yml
index 77895f2b39..2f42db9cb3 100644
--- a/.github/workflows/scheduled-extension-checker.yml
+++ b/.github/workflows/scheduled-extension-checker.yml
@@ -100,9 +100,11 @@ jobs:
                 print(f"Skip: {name} - already extended, past deadline", file=sys.stderr)
                 continue
 
+            cfp_link = str(conf.get("cfp_link", ""))
             closing_soon.append({
                 "conference": name,
                 "url": url,
+                "cfp_link": cfp_link,
                 "cfp_date": str(cfp_date),
                 "has_extension": has_extension,
                 "check_type": check_type,
@@ -151,11 +153,15 @@ jobs:
       fail-fast: false
 
     steps:
-      - name: Trigger extension check
+      - name: Install lynx
+        run: sudo apt-get update && sudo apt-get install -y lynx
+
+      - name: Fetch website content and trigger check
        env:
          # Pass matrix values via env for safety
          CONF_NAME: ${{ matrix.conference.conference }}
          CONF_URL: ${{ matrix.conference.url }}
+          CONF_CFP_LINK: ${{ matrix.conference.cfp_link }}
          CONF_REASON: ${{ matrix.conference.reason }}
          CONF_CFP_DATE: ${{ matrix.conference.cfp_date }}
          CONF_CHECK_TYPE: ${{ matrix.conference.check_type }}
@@ -163,11 +169,37 @@
           GH_TOKEN: ${{ github.token }}
         run: |
           echo "Triggering check for: $CONF_NAME"
-          echo "URL: $CONF_URL"
+          echo "Main URL: $CONF_URL"
+          echo "CFP Link: $CONF_CFP_LINK"
           echo "Reason: $CONF_REASON"
           echo "CFP Date: $CONF_CFP_DATE"
           echo "Check Type: $CONF_CHECK_TYPE"
 
+          # Fetch main website content (a bare "|| echo" after the pipeline would
+          # never fire, since the pipeline's exit status is head's, so test the
+          # result instead)
+          echo "Fetching main website content..."
+          MAIN_CONTENT=""
+          if [ -n "$CONF_URL" ]; then
+            MAIN_CONTENT=$(lynx -dump -nolist "$CONF_URL" 2>/dev/null | head -500); [ -n "$MAIN_CONTENT" ] || MAIN_CONTENT="Failed to fetch main website"
+          fi
+
+          # Fetch CFP link content
+          echo "Fetching CFP link content..."
+          CFP_CONTENT=""
+          if [ -n "$CONF_CFP_LINK" ] && [ "$CONF_CFP_LINK" != "$CONF_URL" ]; then
+            CFP_CONTENT=$(lynx -dump -nolist "$CONF_CFP_LINK" 2>/dev/null | head -500); [ -n "$CFP_CONTENT" ] || CFP_CONTENT="Failed to fetch CFP page"
+          fi
+
+          # Combine content for analysis (printf handles the multi-line format safely)
+          COMBINED_CONTENT=$(printf '%s\n\n%s\n%s\n\n%s\n%s' \
+            "=== EXTENSION CHECK: $CONF_REASON ===" \
+            "=== MAIN WEBSITE ($CONF_URL) ===" \
+            "$MAIN_CONTENT" \
+            "=== CFP PAGE ($CONF_CFP_LINK) ===" \
+            "$CFP_CONTENT")
+
+          # Truncate if too long (GitHub has payload limits)
+          COMBINED_CONTENT=$(echo "$COMBINED_CONTENT" | head -c 60000)
+
           # Use gh CLI for safe API call
           # Pass trigger context so triage workflow can adjust prompt accordingly
           gh api repos/${{ github.repository }}/dispatches \
@@ -175,13 +207,13 @@
             -f "client_payload[url]=$CONF_URL" \
             -f "client_payload[title]=$CONF_NAME" \
             -f "client_payload[watch_uuid]=" \
-            -f "client_payload[diff]=Scheduled extension check: $CONF_REASON" \
+            -f "client_payload[diff]=$COMBINED_CONTENT" \
             -f "client_payload[source]=scheduled-checker" \
             -f "client_payload[trigger_reason]=$CONF_CHECK_TYPE" \
             -f "client_payload[original_cfp_deadline]=$CONF_CFP_DATE" \
             -f "client_payload[has_extension]=$CONF_HAS_EXT"
 
-          echo "✓ Triggered"
+          echo "✓ Triggered with website content"
 
       - name: Rate limit pause
         run: sleep 10  # Generous pause between triggers
diff --git a/utils/enrich_tba.py b/utils/enrich_tba.py
index 42f14b645d..5c0ec87841 100644
--- a/utils/enrich_tba.py
+++ b/utils/enrich_tba.py
@@ -382,67 +382,11 @@ def extract_links_from_url(url: str) -> dict[str, str]:
     return found
 
 
-def content_contains_cfp_info(content: str) -> bool:
-    """Check if content contains main CFP (Call for Papers/Proposals) information.
-
-    This validates that a potential CFP link leads to a page about submitting
-    talks/papers/proposals, NOT other "Call for X" pages like sponsors, volunteers,
-    specialist tracks, etc.
-
-    Parameters
-    ----------
-    content : str
-        Page content to check
-
-    Returns
-    -------
-    bool
-        True if content appears to be about main CFP submissions
-    """
-    content_lower = content.lower()
-
-    # Must contain deadline-related keywords
-    deadline_keywords = [
-        "deadline",
-        "submit",
-        "submission",
-        "due",
-        "closes",
-        "close",
-        "call for",
-        "cfp",
-        "proposal",
-        "abstract",
-    ]
-
-    has_deadline_keyword = any(kw in content_lower for kw in deadline_keywords)
-    if not has_deadline_keyword:
-        return False
-
-    # Must contain date-like patterns
-    date_patterns = [
-        # Month names (English)
-        r"\b(?:january|february|march|april|may|june|july|august|september|october|november|december)\b",
-        # Month abbreviations
-        r"\b(?:jan|feb|mar|apr|jun|jul|aug|sep|oct|nov|dec)[.\s]",
-        # ISO-like dates: YYYY-MM-DD or DD-MM-YYYY
-        r"\b\d{4}[-/]\d{1,2}[-/]\d{1,2}\b",
-        r"\b\d{1,2}[-/]\d{1,2}[-/]\d{4}\b",
-        # Day Month Year: "15 January 2026" or "January 15, 2026"
-        r"\b\d{1,2}\s+(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{4}\b",
-        r"\b(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},?\s+\d{4}\b",
-        # Year patterns near deadline words (e.g., "2026" near "deadline")
-        r"\b202[4-9]\b",
-    ]
-
-    return any(re.search(pattern, content_lower) for pattern in date_patterns)
-
-
 def run_deterministic_extraction(conferences: list[dict[str, Any]]) -> EnrichmentResult:
     """Run deterministic link extraction on all conferences.
 
-    This extracts social media links, sponsor/finaid pages, and validated CFP links
-    without using AI - purely pattern matching and content validation.
+    This extracts social media links and sponsor/finaid pages without using AI -
+    purely pattern matching. CFP links are validated by Claude in full mode.
 
     Parameters
     ----------
@@ -468,24 +412,7 @@ def run_deterministic_extraction(conferences: list[dict[str, Any]]) -> Enrichmen
         logger.debug(f"Deterministic extraction for {name} {year}")
         extracted = extract_links_from_url(url)
 
-        # Also try to find and validate CFP link
-        # (can be external domain like Pretalx, SessionizeApp, etc.)
-        if "cfp_link" not in extracted:
-            cfp_links = find_cfp_links(url)
-            for cfp_url in cfp_links:
-                # Skip if same as main URL
-                if cfp_url == url:
-                    continue
-
-                logger.debug(f"  Validating CFP link: {cfp_url}")
-                cfp_content = prefetch_website(cfp_url)
-
-                if cfp_content and not cfp_content.startswith("Error"):
-                    if content_contains_cfp_info(cfp_content):
-                        extracted["cfp_link"] = cfp_url
-                        logger.debug(f"  Validated cfp_link: {cfp_url}")
-                        break
-                logger.debug(f"  Skipped (no CFP info): {cfp_url}")
+        # Note: cfp_link is validated by Claude in full mode, not here
 
         if extracted:
             # Create ConferenceUpdate with deterministic fields
@@ -694,10 +621,13 @@ def find_cfp_links(url: str) -> list[str]:
     return cfp_links[:3]  # Limit to 3 CFP pages max
 
 
-def prefetch_websites(conferences: list[dict[str, Any]]) -> dict[str, str]:
+def prefetch_websites(
+    conferences: list[dict[str, Any]],
+) -> tuple[dict[str, str], dict[str, str]]:
     """Pre-fetch website content for multiple conferences.
 
-    Also fetches CFP-related subpages if found on the main page.
+    Also fetches CFP-related subpages if found on the main page, and tracks
+    which cfp_link was used for each conference (for validation).
 
     Parameters
     ----------
@@ -706,11 +636,14 @@
 
     Returns
     -------
-    dict[str, str]
-        Dict mapping conference key (name_year) to website content
+    tuple[dict[str, str], dict[str, str]]
+        Tuple of (content_map, cfp_link_map):
+        - content_map: Dict mapping conference key (name_year) to website content
+        - cfp_link_map: Dict mapping conference key to the cfp_link URL that was fetched
     """
     logger = get_logger()
     content_map: dict[str, str] = {}
+    cfp_link_map: dict[str, str] = {}
 
     for conf in conferences:
         name = conf.get("conference", "Unknown")
@@ -743,28 +676,39 @@
                 if link not in cfp_links_to_fetch and link != url:
                     cfp_links_to_fetch.append(link)
 
+        # Track the first valid cfp_link we fetch (for later validation)
+        fetched_cfp_link = None
+
         # Fetch CFP subpages (limit to 2 to avoid too much content)
         for cfp_url in cfp_links_to_fetch[:2]:
             logger.debug(f"  Also fetching CFP page: {cfp_url}")
             cfp_content = prefetch_website(cfp_url)
             if cfp_content and not cfp_content.startswith("Error"):
                 additional_content.append(f"\n\n--- CFP Page ({cfp_url}) ---\n{cfp_content}")
+                # Track the first successfully fetched cfp_link
+                if fetched_cfp_link is None:
+                    fetched_cfp_link = cfp_url
+
+        # Store the cfp_link that was fetched (if any)
+        if fetched_cfp_link:
+            cfp_link_map[key] = fetched_cfp_link
 
         # Combine all content
         combined = main_content + "".join(additional_content)
         content_map[key] = combined[:MAX_CONTENT_LENGTH]
 
-    return content_map
+    return content_map, cfp_link_map
 
 
 def build_enrichment_prompt(
     conferences: list[dict[str, Any]],
     content_map: dict[str, str],
 ) -> str:
-    """Build the Claude API prompt for date/timezone extraction.
+    """Build the Claude API prompt for CFP data extraction.
 
-    Note: URL fields (bluesky, mastodon, sponsor, finaid, cfp_link) are handled
-    deterministically and don't need AI extraction.
+    Note: URL fields (bluesky, mastodon, sponsor, finaid) are handled
+    deterministically. cfp_link is validated implicitly - if Claude finds
+    a CFP deadline in content that includes the cfp_link page, that link is valid.
 
     Parameters
     ----------
@@ -1222,7 +1166,7 @@ def enrich_tba_conferences(
 
     # Pre-fetch websites for AI processing
     logger.info("Pre-fetching conference websites...")
-    content_map = prefetch_websites(tba_conferences)
+    content_map, cfp_link_map = prefetch_websites(tba_conferences)
 
     # Build and send prompt to Claude for date/timezone extraction
     logger.info("Calling Claude API for date/timezone extraction...")
@@ -1251,6 +1195,16 @@
         f"{ai_result.summary.get('not_announced', 0)} not announced",
     )
 
+    # Add cfp_link to conferences where Claude found a CFP deadline.
+    # This validates the cfp_link - if Claude found CFP info in the content
+    # that includes the cfp_link page, that link is valid.
+    for conf_update in ai_result.conferences:
+        key = f"{conf_update.conference}_{conf_update.year}"
+        if "cfp" in conf_update.fields and key in cfp_link_map:
+            cfp_link_url = cfp_link_map[key]
+            conf_update.fields["cfp_link"] = FieldUpdate(value=cfp_link_url, confidence=1.0)
+            logger.info(f"  {key}: validated cfp_link {cfp_link_url}")
+
     # Merge deterministic and AI results
     logger.info("Merging deterministic and AI results...")
     result = merge_enrichment_results(deterministic_result, ai_result)
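
Since `client_payload[diff]` now carries labelled page dumps rather than a fixed sentinel string, the receiving triage workflow can split the sections back apart. Below is a minimal consumer sketch, assuming the payload arrives as the `client_payload` object of a `repository_dispatch` event; the `split_combined_content` helper and the sample payload values are hypothetical, and only the `=== ... ===` section markers come from the workflow above.

```python
import re


# Hypothetical helper: split the COMBINED_CONTENT string built by the
# workflow back into its labelled sections. Only the "=== ... ===" markers
# are taken from the workflow; everything else here is illustrative.
def split_combined_content(diff_text: str) -> dict[str, str]:
    sections: dict[str, str] = {}
    # Matches header lines like "=== MAIN WEBSITE (https://example.org) ==="
    header = re.compile(r"^=== (.+?) ===$", re.MULTILINE)
    matches = list(header.finditer(diff_text))
    for i, match in enumerate(matches):
        start = match.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(diff_text)
        sections[match.group(1)] = diff_text[start:end].strip()
    return sections


if __name__ == "__main__":
    # Sample payload mirroring the -f "client_payload[...]" fields above;
    # the URLs and reason text are made up.
    payload = {
        "url": "https://example.org",
        "title": "ExampleConf",
        "diff": (
            "=== EXTENSION CHECK: cfp deadline approaching ===\n\n"
            "=== MAIN WEBSITE (https://example.org) ===\n"
            "CFP closes January 15, 2026\n\n"
            "=== CFP PAGE (https://pretalx.com/example) ===\n"
            "Submissions due January 15, 2026"
        ),
        "trigger_reason": "closing_soon",
    }
    for name, body in split_combined_content(payload["diff"]).items():
        print(f"{name!r} -> {body!r}")
```

Because the workflow truncates the combined text at 60000 characters, the CFP section can be cut off for very long pages, so a consumer along these lines should tolerate a missing or partial final section.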
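On the Python side, the implicit cfp_link validation is straightforward to exercise in isolation. The sketch below uses simplified stand-ins for `FieldUpdate` and `ConferenceUpdate`, reduced to the attributes the loop in `enrich_tba_conferences` actually touches; it is not the real model code from `utils/enrich_tba.py`.

```python
from dataclasses import dataclass, field
from typing import Any


# Simplified stand-ins for the real models in utils/enrich_tba.py;
# only the attributes used by the validation loop are modelled.
@dataclass
class FieldUpdate:
    value: Any
    confidence: float


@dataclass
class ConferenceUpdate:
    conference: str
    year: int
    fields: dict[str, FieldUpdate] = field(default_factory=dict)


def attach_validated_cfp_links(
    updates: list[ConferenceUpdate],
    cfp_link_map: dict[str, str],
) -> None:
    """Mirror the loop above: a cfp_link counts as validated only when
    Claude extracted a CFP deadline from content that included that page."""
    for conf_update in updates:
        key = f"{conf_update.conference}_{conf_update.year}"
        if "cfp" in conf_update.fields and key in cfp_link_map:
            conf_update.fields["cfp_link"] = FieldUpdate(
                value=cfp_link_map[key], confidence=1.0
            )


if __name__ == "__main__":
    updates = [
        # Deadline found by Claude -> cfp_link gets attached
        ConferenceUpdate("ExampleConf", 2026, {"cfp": FieldUpdate("2026-01-15", 0.9)}),
        # No deadline found -> cfp_link stays unset even though a page was fetched
        ConferenceUpdate("OtherConf", 2026, {}),
    ]
    cfp_link_map = {
        "ExampleConf_2026": "https://pretalx.com/exampleconf-2026/cfp",
        "OtherConf_2026": "https://otherconf.example/cfp",
    }
    attach_validated_cfp_links(updates, cfp_link_map)
    for u in updates:
        print(u.conference, "cfp_link" in u.fields)
```

Note that the attached link always gets confidence 1.0, mirroring the hardcoded value in the diff, even when the deadline itself was extracted with lower confidence.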