diff --git a/tests/test_country_mappings.py b/tests/test_country_mappings.py new file mode 100644 index 0000000000..cfc6f9ee7e --- /dev/null +++ b/tests/test_country_mappings.py @@ -0,0 +1,246 @@ +"""Tests for the centralized country mappings module. + +This module tests that all country mapping functionality is consistent +and that the single source of truth (countries.py) works correctly. +""" + + +class TestCountryDisplayNames: + """Test that country codes map to correct display names.""" + + def test_us_stays_as_us(self): + """US should stay as 'US' in conference names.""" + from tidy_conf.countries import get_country_display_name + + assert get_country_display_name("US") == "US" + + def test_usa_becomes_us(self): + """USA should become 'US' in conference names.""" + from tidy_conf.countries import get_country_display_name + + assert get_country_display_name("USA") == "US" + + def test_uk_stays_as_uk(self): + """UK should stay as 'UK' in conference names.""" + from tidy_conf.countries import get_country_display_name + + assert get_country_display_name("UK") == "UK" + + def test_gb_becomes_uk(self): + """GB should become 'UK' in conference names.""" + from tidy_conf.countries import get_country_display_name + + assert get_country_display_name("GB") == "UK" + + def test_cz_expands_to_czechia(self): + """CZ should expand to 'Czechia'.""" + from tidy_conf.countries import get_country_display_name + + assert get_country_display_name("CZ") == "Czechia" + + def test_standard_codes_expand_via_iso(self): + """Standard ISO codes should expand to country names.""" + from tidy_conf.countries import get_country_display_name + + # These should use ISO 3166 lookup + assert get_country_display_name("DE") == "Germany" + assert get_country_display_name("FR") == "France" + assert get_country_display_name("PL") == "Poland" + assert get_country_display_name("JP") == "Japan" + + +class TestCountryNormalization: + """Test that country names normalize to canonical short forms.""" + + 
def test_normalize_us_variations(self): + """All US variations should normalize to 'US'.""" + from tidy_conf.countries import normalize_country_name + + assert normalize_country_name("United States") == "US" + assert normalize_country_name("United States of America") == "US" + assert normalize_country_name("USA") == "US" + assert normalize_country_name("US") == "US" + + def test_normalize_uk_variations(self): + """All UK variations should normalize to 'UK'.""" + from tidy_conf.countries import normalize_country_name + + assert normalize_country_name("United Kingdom") == "UK" + assert normalize_country_name("Great Britain") == "UK" + assert normalize_country_name("England") == "UK" + assert normalize_country_name("UK") == "UK" + assert normalize_country_name("GB") == "UK" + + def test_normalize_czechia_variations(self): + """Czech variations should normalize to 'Czechia'.""" + from tidy_conf.countries import normalize_country_name + + assert normalize_country_name("Czech Republic") == "Czechia" + assert normalize_country_name("Czechia") == "Czechia" + + def test_normalize_preserves_unknown(self): + """Unknown countries should be preserved.""" + from tidy_conf.countries import normalize_country_name + + assert normalize_country_name("Germany") == "Germany" + assert normalize_country_name("Japan") == "Japan" + + +class TestAlpha3Lookup: + """Test ISO 3166 alpha-3 code lookup.""" + + def test_us_variations_return_usa_code(self): + """All US variations should return alpha-3 code 'USA'.""" + from tidy_conf.countries import get_country_alpha3 + + assert get_country_alpha3("US") == "USA" + assert get_country_alpha3("USA") == "USA" + assert get_country_alpha3("United States") == "USA" + assert get_country_alpha3("United States of America") == "USA" + + def test_uk_variations_return_gbr_code(self): + """All UK variations should return alpha-3 code 'GBR'.""" + from tidy_conf.countries import get_country_alpha3 + + assert get_country_alpha3("UK") == "GBR" + assert 
get_country_alpha3("United Kingdom") == "GBR" + assert get_country_alpha3("England") == "GBR" + + def test_standard_countries(self): + """Standard country names should return correct alpha-3 codes.""" + from tidy_conf.countries import get_country_alpha3 + + assert get_country_alpha3("Germany") == "DEU" + assert get_country_alpha3("France") == "FRA" + assert get_country_alpha3("Japan") == "JPN" + + def test_preserves_unknown_countries(self): + """Unknown countries should be preserved, not lost.""" + from tidy_conf.countries import get_country_alpha3 + + assert get_country_alpha3("Atlantis") == "Atlantis" + assert get_country_alpha3("Unknown Place") == "Unknown Place" + + def test_handles_empty_input(self): + """Empty input should return empty string.""" + from tidy_conf.countries import get_country_alpha3 + + assert get_country_alpha3("") == "" + assert get_country_alpha3(None) == "" + assert get_country_alpha3(" ") == "" + + +class TestConferenceNameExpansion: + """Test that conference names with country codes expand correctly.""" + + def test_pycon_us_stays_as_us(self): + """PyCon US should stay as 'PyCon US'.""" + from tidy_conf.titles import expand_country_codes + + result = expand_country_codes("PyCon US") + assert result == "PyCon US" + + def test_pycon_uk_stays_as_uk(self): + """PyCon UK should stay as 'PyCon UK'.""" + from tidy_conf.titles import expand_country_codes + + result = expand_country_codes("PyCon UK") + assert result == "PyCon UK" + + def test_pycon_pl_expands_correctly(self): + """PyCon PL should expand to 'PyCon Poland'.""" + from tidy_conf.titles import expand_country_codes + + result = expand_country_codes("PyCon PL") + assert result == "PyCon Poland" + + def test_pycon_de_expands_correctly(self): + """PyCon DE should expand to 'PyCon Germany'.""" + from tidy_conf.titles import expand_country_codes + + result = expand_country_codes("PyCon DE") + assert result == "PyCon Germany" + + def test_expansion_is_idempotent(self): + """Expanding twice 
should give the same result.""" + from tidy_conf.titles import expand_country_codes + + once = expand_country_codes("PyCon US") + twice = expand_country_codes(once) + assert once == twice + + +class TestCountryCodeToNameMapping: + """Test the COUNTRY_CODE_TO_NAME mapping is correctly built.""" + + def test_us_maps_to_us(self): + """COUNTRY_CODE_TO_NAME['US'] should be 'US'.""" + from tidy_conf.countries import COUNTRY_CODE_TO_NAME + + assert COUNTRY_CODE_TO_NAME["US"] == "US" + + def test_uk_maps_to_uk(self): + """COUNTRY_CODE_TO_NAME['UK'] should be 'UK'.""" + from tidy_conf.countries import COUNTRY_CODE_TO_NAME + + assert COUNTRY_CODE_TO_NAME["UK"] == "UK" + + def test_usa_maps_to_us(self): + """COUNTRY_CODE_TO_NAME['USA'] should be 'US'.""" + from tidy_conf.countries import COUNTRY_CODE_TO_NAME + + assert COUNTRY_CODE_TO_NAME["USA"] == "US" + + def test_de_maps_to_germany(self): + """COUNTRY_CODE_TO_NAME['DE'] should be 'Germany'.""" + from tidy_conf.countries import COUNTRY_CODE_TO_NAME + + assert COUNTRY_CODE_TO_NAME["DE"] == "Germany" + + +class TestMergeReplacementsConsistency: + """Test that the merge replacements use the centralized mappings.""" + + def test_interactive_merge_uses_country_normalization(self): + """interactive_merge.py should use COUNTRY_NORMALIZATION.""" + from tidy_conf.countries import COUNTRY_NORMALIZATION + + # These should all be in COUNTRY_NORMALIZATION + assert "United States of America" in COUNTRY_NORMALIZATION + assert "United Kingdom" in COUNTRY_NORMALIZATION + assert "Czech Republic" in COUNTRY_NORMALIZATION + + # And map to the correct canonical forms + assert COUNTRY_NORMALIZATION["United States of America"] == "US" + assert COUNTRY_NORMALIZATION["United Kingdom"] == "UK" + assert COUNTRY_NORMALIZATION["Czech Republic"] == "Czechia" + + +class TestRegressionUSExpansion: + """Regression tests to prevent US -> United States of America bug.""" + + def test_normalize_conference_name_us_stays_as_us(self): + 
"""normalize_conference_name should keep US as 'US'.""" + from tidy_conf.titles import normalize_conference_name + + result = normalize_conference_name("PyCon US") + assert result == "PyCon US" + assert "United States" not in result + + def test_pycon_de_expands_to_germany(self): + """PyCon DE should expand to 'PyCon Germany'.""" + from tidy_conf.titles import normalize_conference_name + + result = normalize_conference_name("PyCon DE") + assert result == "PyCon Germany" + + def test_place_with_us_not_expanded(self): + """A place ending with 'US' or 'USA' should normalize correctly.""" + from tidy_conf.countries import normalize_country_name + + # Direct normalization + assert normalize_country_name("USA") == "US" + + # After normalization, it should stay as "US" + result = normalize_country_name("US") + assert result == "US" diff --git a/utils/import_python_organizers.py b/utils/import_python_organizers.py index d0592cea0b..cc1f4ca0ff 100644 --- a/utils/import_python_organizers.py +++ b/utils/import_python_organizers.py @@ -5,7 +5,6 @@ from urllib import error as urllib_error # Third-party -import iso3166 import pandas as pd # Local imports @@ -13,6 +12,7 @@ from tidy_conf import fuzzy_match from tidy_conf import load_conferences from tidy_conf import merge_conferences + from tidy_conf.countries import get_country_alpha3 from tidy_conf.deduplicate import deduplicate from tidy_conf.schema import get_schema from tidy_conf.titles import normalize_conference_name @@ -23,6 +23,7 @@ from .tidy_conf import fuzzy_match from .tidy_conf import load_conferences from .tidy_conf import merge_conferences + from .tidy_conf.countries import get_country_alpha3 from .tidy_conf.deduplicate import deduplicate from .tidy_conf.schema import get_schema from .tidy_conf.titles import normalize_conference_name @@ -31,88 +32,6 @@ from .tidy_conf.yaml import write_df_yaml -# Common country name variations that map to iso3166 official names -# The iso3166 library uses full official names as 
keys (e.g., "UNITED STATES OF AMERICA") -# This mapping handles common short names and variations -COUNTRY_NAME_ALIASES = { - # United States variations - "USA": "UNITED STATES OF AMERICA", - "US": "UNITED STATES OF AMERICA", - "UNITED STATES": "UNITED STATES OF AMERICA", - # United Kingdom variations - "UK": "UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND", - "UNITED KINGDOM": "UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND", - "GREAT BRITAIN": "UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND", - "BRITAIN": "UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND", - "ENGLAND": "UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND", - "SCOTLAND": "UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND", - "WALES": "UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND", - "NORTHERN IRELAND": "UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND", - # Other common variations - "CZECHIA": "CZECHIA", # iso3166 now uses CZECHIA - "CZECH REPUBLIC": "CZECHIA", - "KOREA": "KOREA, REPUBLIC OF", - "SOUTH KOREA": "KOREA, REPUBLIC OF", - "RUSSIA": "RUSSIAN FEDERATION", - "VIETNAM": "VIET NAM", - "TAIWAN": "TAIWAN, PROVINCE OF CHINA", - "IRAN": "IRAN, ISLAMIC REPUBLIC OF", - "SYRIA": "SYRIAN ARAB REPUBLIC", - "BOLIVIA": "BOLIVIA, PLURINATIONAL STATE OF", - "VENEZUELA": "VENEZUELA, BOLIVARIAN REPUBLIC OF", - "TANZANIA": "TANZANIA, UNITED REPUBLIC OF", - "MOLDOVA": "MOLDOVA, REPUBLIC OF", - "LAOS": "LAO PEOPLE'S DEMOCRATIC REPUBLIC", - "PALESTINE": "PALESTINE, STATE OF", - "THE NETHERLANDS": "NETHERLANDS", - "HOLLAND": "NETHERLANDS", -} - - -def get_country_alpha3(country_name: str) -> str: - """Get ISO 3166-1 alpha-3 country code from a country name. - - This function performs robust country code lookup with fallbacks: - 1. Direct lookup in iso3166.countries_by_name - 2. Lookup using common country name aliases - 3. 
If all lookups fail, returns the original country name to preserve data - - Parameters - ---------- - country_name : str - The country name to look up (e.g., "United States", "USA", "Germany") - - Returns - ------- - str - ISO 3166-1 alpha-3 code if found (e.g., "USA", "DEU"), - otherwise returns the original country name to preserve data - """ - if not country_name or not isinstance(country_name, str): - return "" - - name_upper = country_name.strip().upper() - - if not name_upper: - return "" - - # Try direct lookup first - country = iso3166.countries_by_name.get(name_upper) - if country: - return country.alpha3 - - # Try lookup using common aliases - if name_upper in COUNTRY_NAME_ALIASES: - aliased_name = COUNTRY_NAME_ALIASES[name_upper] - country = iso3166.countries_by_name.get(aliased_name) - if country: - return country.alpha3 - - # Fallback: return original country name to preserve data - # This ensures we don't silently lose country information - return country_name.strip() - - def load_remote(year: int) -> pd.DataFrame: """Load conference data from GitHub CSV for a specific year. diff --git a/utils/tidy_conf/countries.py b/utils/tidy_conf/countries.py new file mode 100644 index 0000000000..4200b46a26 --- /dev/null +++ b/utils/tidy_conf/countries.py @@ -0,0 +1,271 @@ +"""Centralized country mappings for conference data processing. + +This module is the SINGLE SOURCE OF TRUTH for all country-related mappings. +All other modules should import from here to ensure consistency. 
+ +Design decisions: +- We use SHORT forms (US, UK) as the canonical display format for places +- ISO 3166 alpha-3 codes (USA, GBR) are used for CSV Country column +- Conference names expand 2-letter codes to full names (PyCon DE -> PyCon Germany), except US and UK, which stay short +""" + +import iso3166 + +# ============================================================================= +# CANONICAL SHORT NAMES +# ============================================================================= +# These are the preferred short names used in the 'place' field +# e.g., "San Francisco, US" or "London, UK" + +CANONICAL_COUNTRY_NAMES = { + "US": "US", # United States - always use "US" + "UK": "UK", # United Kingdom - always use "UK" + "Czechia": "Czechia", # Not "Czech Republic" +} + +# ============================================================================= +# DISPLAY NAME MAPPINGS (for conference names) +# ============================================================================= +# Maps country codes and variations to display names for conference titles +# Some codes stay as-is (US, UK), others expand to full names (DE -> Germany) + +COUNTRY_DISPLAY_NAMES = { + # These stay as short codes in conference names (e.g., "PyCon US" stays "PyCon US") + "US": "US", + "USA": "US", + "UK": "UK", + "GB": "UK", + # These expand to full names (e.g., "PyCon DE" -> "PyCon Germany") + "CZ": "Czechia", + "NZ": "New Zealand", + "KR": "South Korea", + "ZA": "South Africa", +} + +# ============================================================================= +# NORMALIZATION MAPPINGS +# ============================================================================= +# Maps various country name formats to canonical short form +# Used when normalizing place fields during merge operations + +COUNTRY_NORMALIZATION = { + # US variations -> US + "United States": "US", + "United States of America": "US", + "USA": "US", + # UK variations -> UK + "United Kingdom": "UK", + "United Kingdom of Great Britain and Northern 
Ireland": "UK", + "Great Britain": "UK", + "Britain": "UK", + "England": "UK", + "Scotland": "UK", + "Wales": "UK", + "GB": "UK", + # Czechia variations + "Czech Republic": "Czechia", + # Korea variations + "Korea": "South Korea", + "Korea, Republic of": "South Korea", +} + +# ============================================================================= +# ISO 3166 ALPHA-3 LOOKUP ALIASES +# ============================================================================= +# Maps common country names to ISO 3166 official names for alpha-3 lookup +# The iso3166 library requires exact official names + +ISO_COUNTRY_ALIASES = { + # United States variations + "USA": "UNITED STATES OF AMERICA", + "US": "UNITED STATES OF AMERICA", + "UNITED STATES": "UNITED STATES OF AMERICA", + # United Kingdom variations + "UK": "UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND", + "UNITED KINGDOM": "UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND", + "GREAT BRITAIN": "UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND", + "BRITAIN": "UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND", + "ENGLAND": "UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND", + "SCOTLAND": "UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND", + "WALES": "UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND", + "NORTHERN IRELAND": "UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND", + # Other common variations + "CZECHIA": "CZECHIA", + "CZECH REPUBLIC": "CZECHIA", + "KOREA": "KOREA, REPUBLIC OF", + "SOUTH KOREA": "KOREA, REPUBLIC OF", + "RUSSIA": "RUSSIAN FEDERATION", + "VIETNAM": "VIET NAM", + "TAIWAN": "TAIWAN, PROVINCE OF CHINA", + "IRAN": "IRAN, ISLAMIC REPUBLIC OF", + "SYRIA": "SYRIAN ARAB REPUBLIC", + "BOLIVIA": "BOLIVIA, PLURINATIONAL STATE OF", + "VENEZUELA": "VENEZUELA, BOLIVARIAN REPUBLIC OF", + "TANZANIA": "TANZANIA, UNITED REPUBLIC OF", + "MOLDOVA": "MOLDOVA, REPUBLIC OF", + "LAOS": "LAO PEOPLE'S DEMOCRATIC REPUBLIC", + "PALESTINE": "PALESTINE, STATE OF", + "THE NETHERLANDS": 
"NETHERLANDS", + "HOLLAND": "NETHERLANDS", +} + + +# ============================================================================= +# HELPER FUNCTIONS +# ============================================================================= + + +def normalize_country_name(country: str) -> str: + """Normalize a country name to its canonical short form. + + Parameters + ---------- + country : str + Country name to normalize (e.g., "United States of America", "USA", "US") + + Returns + ------- + str + Canonical short form (e.g., "US", "UK", "Czechia") + """ + if not country or not isinstance(country, str): + return country if isinstance(country, str) else "" + + country = country.strip() + + # Check if it's already canonical + if country in CANONICAL_COUNTRY_NAMES: + return country + + # Try normalization mapping + if country in COUNTRY_NORMALIZATION: + return COUNTRY_NORMALIZATION[country] + + # Try case-insensitive normalization + country_upper = country.upper() + for key, value in COUNTRY_NORMALIZATION.items(): + if key.upper() == country_upper: + return value + + return country + + +def get_country_display_name(code: str) -> str: + """Get the full display name for a country code. + + Used for expanding country codes in conference names. 
+ + Parameters + ---------- + code : str + Country code (e.g., "US", "UK", "DE") + + Returns + ------- + str + Display name for titles (e.g., "US" for "US"/"USA", "UK" for "UK"/"GB", "Germany" for "DE") + """ + if not code or not isinstance(code, str): + return code if isinstance(code, str) else "" + + code = code.strip() + + # Check custom display names first + if code in COUNTRY_DISPLAY_NAMES: + return COUNTRY_DISPLAY_NAMES[code] + + # Fall back to ISO 3166 lookup + try: + country = iso3166.countries.get(code) + if country: + name = country.name + # Handle comma-separated names like "Korea, Republic of" + if "," in name: + return name.split(",")[0] + return name + except (KeyError, AttributeError): + pass + + return code + + +def get_country_alpha3(country_name: str) -> str: + """Get ISO 3166-1 alpha-3 country code from a country name. + + Parameters + ---------- + country_name : str + The country name to look up (e.g., "United States", "USA", "Germany") + + Returns + ------- + str + ISO 3166-1 alpha-3 code if found (e.g., "USA", "DEU"), + otherwise returns the original country name to preserve data + """ + if not country_name or not isinstance(country_name, str): + return "" + + name_upper = country_name.strip().upper() + + if not name_upper: + return "" + + # Try direct lookup first + country = iso3166.countries_by_name.get(name_upper) + if country: + return country.alpha3 + + # Try lookup using aliases + if name_upper in ISO_COUNTRY_ALIASES: + aliased_name = ISO_COUNTRY_ALIASES[name_upper] + country = iso3166.countries_by_name.get(aliased_name) + if country: + return country.alpha3 + + # Fallback: return original country name to preserve data + return country_name.strip() + + +# ============================================================================= +# BUILD COUNTRY CODE MAPPINGS +# ============================================================================= +# These dictionaries are used by titles.py for conference name expansion + +COUNTRY_CODE_TO_NAME = {} 
+COUNTRY_NAME_TO_CODE = {} + + +def _build_country_mappings(): + """Build the country code mappings from ISO 3166 and custom overrides.""" + global COUNTRY_CODE_TO_NAME, COUNTRY_NAME_TO_CODE + + # First, load ISO 3166 country codes + for country in iso3166.countries: + code = country.alpha2 + name = country.name + # Handle common name variations (e.g., "Korea, Republic of" -> "Korea") + if "," in name: + short_name = name.split(",")[0] + COUNTRY_CODE_TO_NAME[code] = short_name + COUNTRY_NAME_TO_CODE[short_name] = code + else: + COUNTRY_CODE_TO_NAME[code] = name + COUNTRY_NAME_TO_CODE[name] = code + + # Apply custom overrides from COUNTRY_DISPLAY_NAMES + # This ensures codes like "US" stay as "US" instead of expanding to "United States of America" + for code, name in COUNTRY_DISPLAY_NAMES.items(): + COUNTRY_CODE_TO_NAME[code] = name + if name not in COUNTRY_NAME_TO_CODE: + COUNTRY_NAME_TO_CODE[name] = code + + # Also add entries for common variations pointing to canonical codes + for variation, canonical in COUNTRY_NORMALIZATION.items(): + if variation not in COUNTRY_CODE_TO_NAME and canonical in COUNTRY_CODE_TO_NAME: + # Map variations to their display names via canonical form + COUNTRY_CODE_TO_NAME[variation] = COUNTRY_CODE_TO_NAME[canonical] + + +# Build mappings on module import +_build_country_mappings() diff --git a/utils/tidy_conf/interactive_merge.py b/utils/tidy_conf/interactive_merge.py index 5d3aedd181..124b9c7bf0 100644 --- a/utils/tidy_conf/interactive_merge.py +++ b/utils/tidy_conf/interactive_merge.py @@ -16,6 +16,7 @@ from thefuzz import process try: + from tidy_conf.countries import COUNTRY_NORMALIZATION from tidy_conf.schema import get_schema from tidy_conf.titles import tidy_df_names from tidy_conf.utils import query_yes_no @@ -27,6 +28,7 @@ from tidy_conf.yaml import load_title_mappings from tidy_conf.yaml import update_title_mappings except ImportError: + from .countries import COUNTRY_NORMALIZATION from .schema import get_schema from .titles import 
tidy_df_names from .utils import query_yes_no @@ -449,11 +451,9 @@ def merge_conferences( logger.debug("Dropping 'conference' column from df_remote") df_remote = df_remote.drop(["conference"], axis=1) - replacements = { - "United States of America": "USA", - "United Kingdom": "UK", - "Czech Republic": "Czechia", - } + # Use centralized country normalization mappings + # This ensures consistency with the rest of the codebase + replacements = COUNTRY_NORMALIZATION logger.info("Performing pandas merge on 'title_match'") df_merge = pd.merge( diff --git a/utils/tidy_conf/titles.py b/utils/tidy_conf/titles.py index 5c74fe5453..b5137611e4 100644 --- a/utils/tidy_conf/titles.py +++ b/utils/tidy_conf/titles.py @@ -1,49 +1,10 @@ import re -from iso3166 import countries +# Import centralized country mappings - this is the SINGLE SOURCE OF TRUTH +from tidy_conf.countries import COUNTRY_CODE_TO_NAME from tidy_conf.yaml import load_title_mappings from tqdm import tqdm -# Build country code mappings (both directions) -# e.g., "PL" -> "Poland", "Poland" -> "PL" -COUNTRY_CODE_TO_NAME = {} -COUNTRY_NAME_TO_CODE = {} - -# Custom mappings for common variations used in conference names -CUSTOM_COUNTRY_MAPPINGS = { - "United States": "US", - "United States of America": "US", - "USA": "US", - "UK": "United Kingdom", - "GB": "United Kingdom", - "CZ": "Czechia", - "Czech Republic": "Czechia", - "NZ": "New Zealand", - "KR": "South Korea", - "Korea": "South Korea", - "ZA": "South Africa", -} - -# Load ISO 3166 country codes -for country in countries: - code = country.alpha2 - name = country.name - # Handle common name variations - if "," in name: - # e.g., "Korea, Republic of" -> "Korea" - short_name = name.split(",")[0] - COUNTRY_CODE_TO_NAME[code] = short_name - COUNTRY_NAME_TO_CODE[short_name] = code - else: - COUNTRY_CODE_TO_NAME[code] = name - COUNTRY_NAME_TO_CODE[name] = code - -# Apply custom overrides -for code, name in CUSTOM_COUNTRY_MAPPINGS.items(): - COUNTRY_CODE_TO_NAME[code] = 
name - if name not in COUNTRY_NAME_TO_CODE: - COUNTRY_NAME_TO_CODE[name] = code - def tidy_titles(data): """Tidy up conference titles by replacing misspellings and alternative names."""