40 changes: 36 additions & 4 deletions .github/workflows/scheduled-extension-checker.yml
@@ -100,9 +100,11 @@ jobs:
print(f"Skip: {name} - already extended, past deadline", file=sys.stderr)
continue

cfp_link = str(conf.get("cfp_link", ""))
closing_soon.append({
"conference": name,
"url": url,
"cfp_link": cfp_link,
"cfp_date": str(cfp_date),
"has_extension": has_extension,
"check_type": check_type,
@@ -151,37 +153,67 @@ jobs:
fail-fast: false

steps:
-      - name: Trigger extension check
+      - name: Install lynx
run: sudo apt-get update && sudo apt-get install -y lynx

- name: Fetch website content and trigger check
env:
# Pass matrix values via env for safety
CONF_NAME: ${{ matrix.conference.conference }}
CONF_URL: ${{ matrix.conference.url }}
CONF_CFP_LINK: ${{ matrix.conference.cfp_link }}
CONF_REASON: ${{ matrix.conference.reason }}
CONF_CFP_DATE: ${{ matrix.conference.cfp_date }}
CONF_CHECK_TYPE: ${{ matrix.conference.check_type }}
CONF_HAS_EXT: ${{ matrix.conference.has_extension }}
GH_TOKEN: ${{ github.token }}
run: |
echo "Triggering check for: $CONF_NAME"
echo "URL: $CONF_URL"
echo "Main URL: $CONF_URL"
echo "CFP Link: $CONF_CFP_LINK"
echo "Reason: $CONF_REASON"
echo "CFP Date: $CONF_CFP_DATE"
echo "Check Type: $CONF_CHECK_TYPE"

# Fetch main website content
echo "Fetching main website content..."
MAIN_CONTENT=""
if [ -n "$CONF_URL" ]; then
MAIN_CONTENT=$(lynx -dump -nolist "$CONF_URL" 2>/dev/null | head -500 || echo "Failed to fetch main website")
fi

# Fetch CFP link content
echo "Fetching CFP link content..."
CFP_CONTENT=""
if [ -n "$CONF_CFP_LINK" ] && [ "$CONF_CFP_LINK" != "$CONF_URL" ]; then
CFP_CONTENT=$(lynx -dump -nolist "$CONF_CFP_LINK" 2>/dev/null | head -500 || echo "Failed to fetch CFP page")
fi

# Combine content for analysis (use printf to avoid YAML parsing issues)
COMBINED_CONTENT=$(printf '%s\n\n%s\n%s\n\n%s\n%s' \
"=== EXTENSION CHECK: $CONF_REASON ===" \
"=== MAIN WEBSITE ($CONF_URL) ===" \
"$MAIN_CONTENT" \
"=== CFP PAGE ($CONF_CFP_LINK) ===" \
"$CFP_CONTENT")

# Truncate if too long (GitHub has payload limits)
COMBINED_CONTENT=$(echo "$COMBINED_CONTENT" | head -c 60000)

# Use gh CLI for safe API call
# Pass trigger context so triage workflow can adjust prompt accordingly
gh api repos/${{ github.repository }}/dispatches \
-f event_type=conference-change \
-f "client_payload[url]=$CONF_URL" \
-f "client_payload[title]=$CONF_NAME" \
-f "client_payload[watch_uuid]=" \
-f "client_payload[diff]=Scheduled extension check: $CONF_REASON" \
-f "client_payload[diff]=$COMBINED_CONTENT" \
-f "client_payload[source]=scheduled-checker" \
-f "client_payload[trigger_reason]=$CONF_CHECK_TYPE" \
-f "client_payload[original_cfp_deadline]=$CONF_CFP_DATE" \
-f "client_payload[has_extension]=$CONF_HAS_EXT"

echo "✓ Triggered"
echo "✓ Triggered with website content"

- name: Rate limit pause
run: sleep 10 # Generous pause between triggers
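
For context, a minimal Python sketch of the repository_dispatch payload the step above assembles (field names are taken from the run block; all concrete values are placeholders), useful for sanity-checking the size cap before the gh api call:

import json

# Placeholder for the combined lynx output built in the step above
combined = "=== EXTENSION CHECK: deadline-today ===\n..."

payload = {
    "event_type": "conference-change",
    "client_payload": {
        "url": "https://example.org/conf",   # placeholder conference URL
        "title": "ExampleConf 2026",         # placeholder conference name
        "watch_uuid": "",
        "diff": combined[:60000],            # mirrors the head -c 60000 truncation
        "source": "scheduled-checker",
        "trigger_reason": "deadline-today",  # placeholder check_type
        "original_cfp_deadline": "2026-01-15",
        "has_extension": "false",
    },
}

# GitHub rejects dispatch payloads past roughly 64 KB, hence the truncation above
print(f"payload size: {len(json.dumps(payload))} bytes")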
124 changes: 39 additions & 85 deletions utils/enrich_tba.py
@@ -382,67 +382,11 @@ def extract_links_from_url(url: str) -> dict[str, str]:
return found


-def content_contains_cfp_info(content: str) -> bool:
-    """Check if content contains main CFP (Call for Papers/Proposals) information.
-
-    This validates that a potential CFP link leads to a page about submitting
-    talks/papers/proposals, NOT other "Call for X" pages like sponsors, volunteers,
-    specialist tracks, etc.
-
-    Parameters
-    ----------
-    content : str
-        Page content to check
-
-    Returns
-    -------
-    bool
-        True if content appears to be about main CFP submissions
-    """
-    content_lower = content.lower()
-
-    # Must contain deadline-related keywords
-    deadline_keywords = [
-        "deadline",
-        "submit",
-        "submission",
-        "due",
-        "closes",
-        "close",
-        "call for",
-        "cfp",
-        "proposal",
-        "abstract",
-    ]
-
-    has_deadline_keyword = any(kw in content_lower for kw in deadline_keywords)
-    if not has_deadline_keyword:
-        return False
-
-    # Must contain date-like patterns
-    date_patterns = [
-        # Month names (English)
-        r"\b(?:january|february|march|april|may|june|july|august|september|october|november|december)\b",
-        # Month abbreviations
-        r"\b(?:jan|feb|mar|apr|jun|jul|aug|sep|oct|nov|dec)[.\s]",
-        # ISO-like dates: YYYY-MM-DD or DD-MM-YYYY
-        r"\b\d{4}[-/]\d{1,2}[-/]\d{1,2}\b",
-        r"\b\d{1,2}[-/]\d{1,2}[-/]\d{4}\b",
-        # Day Month Year: "15 January 2026" or "January 15, 2026"
-        r"\b\d{1,2}\s+(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{4}\b",
-        r"\b(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},?\s+\d{4}\b",
-        # Year patterns near deadline words (e.g., "2026" near "deadline")
-        r"\b202[4-9]\b",
-    ]
-
-    return any(re.search(pattern, content_lower) for pattern in date_patterns)


def run_deterministic_extraction(conferences: list[dict[str, Any]]) -> EnrichmentResult:
"""Run deterministic link extraction on all conferences.

-    This extracts social media links, sponsor/finaid pages, and validated CFP links
-    without using AI - purely pattern matching and content validation.
+    This extracts social media links and sponsor/finaid pages without using AI -
+    purely pattern matching. CFP links are validated by Claude in full mode.

Parameters
----------
Expand All @@ -468,24 +412,7 @@ def run_deterministic_extraction(conferences: list[dict[str, Any]]) -> Enrichmen
logger.debug(f"Deterministic extraction for {name} {year}")
extracted = extract_links_from_url(url)

-        # Also try to find and validate CFP link
-        # (can be external domain like Pretalx, SessionizeApp, etc.)
-        if "cfp_link" not in extracted:
-            cfp_links = find_cfp_links(url)
-            for cfp_url in cfp_links:
-                # Skip if same as main URL
-                if cfp_url == url:
-                    continue
-
-                logger.debug(f"  Validating CFP link: {cfp_url}")
-                cfp_content = prefetch_website(cfp_url)
-
-                if cfp_content and not cfp_content.startswith("Error"):
-                    if content_contains_cfp_info(cfp_content):
-                        extracted["cfp_link"] = cfp_url
-                        logger.debug(f"  Validated cfp_link: {cfp_url}")
-                        break
-                logger.debug(f"  Skipped (no CFP info): {cfp_url}")
+        # Note: cfp_link is validated by Claude in full mode, not here

if extracted:
# Create ConferenceUpdate with deterministic fields
@@ -694,10 +621,13 @@ def find_cfp_links(url: str) -> list[str]:
return cfp_links[:3] # Limit to 3 CFP pages max


-def prefetch_websites(conferences: list[dict[str, Any]]) -> dict[str, str]:
+def prefetch_websites(
+    conferences: list[dict[str, Any]],
+) -> tuple[dict[str, str], dict[str, str]]:
"""Pre-fetch website content for multiple conferences.

-    Also fetches CFP-related subpages if found on the main page.
+    Also fetches CFP-related subpages if found on the main page, and tracks
+    which cfp_link was used for each conference (for validation).

Parameters
----------
@@ -706,11 +636,14 @@ def prefetch_websites(conferences: list[dict[str, Any]]) -> dict[str, str]:

Returns
-------
-    dict[str, str]
-        Dict mapping conference key (name_year) to website content
+    tuple[dict[str, str], dict[str, str]]
+        Tuple of (content_map, cfp_link_map):
+        - content_map: Dict mapping conference key (name_year) to website content
+        - cfp_link_map: Dict mapping conference key to the cfp_link URL that was fetched
"""
logger = get_logger()
content_map: dict[str, str] = {}
cfp_link_map: dict[str, str] = {}

for conf in conferences:
name = conf.get("conference", "Unknown")
@@ -743,28 +676,39 @@ def prefetch_websites(conferences: list[dict[str, Any]]) -> dict[str, str]:
if link not in cfp_links_to_fetch and link != url:
cfp_links_to_fetch.append(link)

# Track the first valid cfp_link we fetch (for later validation)
fetched_cfp_link = None

# Fetch CFP subpages (limit to 2 to avoid too much content)
for cfp_url in cfp_links_to_fetch[:2]:
logger.debug(f" Also fetching CFP page: {cfp_url}")
cfp_content = prefetch_website(cfp_url)
if cfp_content and not cfp_content.startswith("Error"):
additional_content.append(f"\n\n--- CFP Page ({cfp_url}) ---\n{cfp_content}")
# Track the first successfully fetched cfp_link
if fetched_cfp_link is None:
fetched_cfp_link = cfp_url

# Store the cfp_link that was fetched (if any)
if fetched_cfp_link:
cfp_link_map[key] = fetched_cfp_link

# Combine all content
combined = main_content + "".join(additional_content)
content_map[key] = combined[:MAX_CONTENT_LENGTH]

-    return content_map
+    return content_map, cfp_link_map

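# Illustration (not part of this diff): minimal use of the new two-map return.
# The conference dict and its field names below are assumptions based on the
# surrounding code, not confirmed by this PR.
confs = [{"conference": "ExampleConf", "year": 2026, "url": "https://example.org"}]
content_map, cfp_link_map = prefetch_websites(confs)
for key, cfp_url in cfp_link_map.items():
    # key is "name_year"; cfp_url is the CFP page whose content was appended
    print(f"{key}: CFP content fetched from {cfp_url}")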

def build_enrichment_prompt(
conferences: list[dict[str, Any]],
content_map: dict[str, str],
) -> str:
"""Build the Claude API prompt for date/timezone extraction.
"""Build the Claude API prompt for CFP data extraction.

-    Note: URL fields (bluesky, mastodon, sponsor, finaid, cfp_link) are handled
-    deterministically and don't need AI extraction.
+    Note: URL fields (bluesky, mastodon, sponsor, finaid) are handled
+    deterministically. cfp_link is validated implicitly - if Claude finds
+    a CFP deadline in content that includes cfp_link content, that link is valid.

Parameters
----------
@@ -1222,7 +1166,7 @@ def enrich_tba_conferences(

# Pre-fetch websites for AI processing
logger.info("Pre-fetching conference websites...")
-    content_map = prefetch_websites(tba_conferences)
+    content_map, cfp_link_map = prefetch_websites(tba_conferences)

# Build and send prompt to Claude for date/timezone extraction
logger.info("Calling Claude API for date/timezone extraction...")
@@ -1251,6 +1195,16 @@
f"{ai_result.summary.get('not_announced', 0)} not announced",
)

# Add cfp_link to conferences where Claude found a CFP deadline
# This validates the cfp_link - if Claude found CFP info in the content
# that includes the cfp_link page, that link is valid
for conf_update in ai_result.conferences:
key = f"{conf_update.conference}_{conf_update.year}"
if "cfp" in conf_update.fields and key in cfp_link_map:
cfp_link_url = cfp_link_map[key]
conf_update.fields["cfp_link"] = FieldUpdate(value=cfp_link_url, confidence=1.0)
logger.info(f" {key}: validated cfp_link {cfp_link_url}")

# Merge deterministic and AI results
logger.info("Merging deterministic and AI results...")
result = merge_enrichment_results(deterministic_result, ai_result)
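
Putting the pieces together, a hedged sketch of the full-mode flow this diff implements (call_claude stands in for the actual Claude API helper, which is not shown in these hunks):

# 1. Deterministic pass: social/sponsor/finaid links only, no AI
deterministic_result = run_deterministic_extraction(tba_conferences)

# 2. Fetch each site plus candidate CFP pages, remembering which CFP URL was used
content_map, cfp_link_map = prefetch_websites(tba_conferences)

# 3. Ask Claude to extract CFP deadlines from the combined content
prompt = build_enrichment_prompt(tba_conferences, content_map)
ai_result = call_claude(prompt)  # assumed helper, not part of this diff

# 4. Implicit validation: keep a cfp_link only when Claude found a CFP deadline
#    in content that included that page
for update in ai_result.conferences:
    key = f"{update.conference}_{update.year}"
    if "cfp" in update.fields and key in cfp_link_map:
        update.fields["cfp_link"] = FieldUpdate(value=cfp_link_map[key], confidence=1.0)

# 5. Merge deterministic and AI results
result = merge_enrichment_results(deterministic_result, ai_result)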