Spaces:
Sleeping
Sleeping
| """ | |
| Data loading utilities for Speech Resource Finder | |
| """ | |
| import csv | |
| import json | |
| import requests | |
def load_language_list(csv_path):
    """
    Load ISO 639 language codes from a CSV file.

    The CSV is expected to have the columns "ISO 639-2", "ISO 639-1",
    "English name" and "French name". Rows missing a 639-2 code or an
    English name are skipped.

    Args:
        csv_path: Path to the language codes CSV file.

    Returns:
        dict: {iso_639_2: {"name": str, "iso_639_1": str, "french_name": str}};
        empty dict if the file cannot be read.
    """
    languages = {}
    try:
        with open(csv_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                # Use .get() with a fallback: DictReader fills missing
                # fields with None, and None.strip() would raise and abort
                # the whole load. This skips the bad row instead.
                iso_639_2 = (row.get('ISO 639-2') or '').strip()
                iso_639_1 = (row.get('ISO 639-1') or '').strip()
                name = (row.get('English name') or '').strip()
                french_name = (row.get('French name') or '').strip()
                if iso_639_2 and name:
                    languages[iso_639_2] = {
                        "name": name,
                        "iso_639_1": iso_639_1,
                        "french_name": french_name,
                    }
        print(f"Loaded {len(languages)} languages from {csv_path}")
    except Exception as e:
        print(f"ERROR: Failed to load language list from {csv_path}: {e}")
        print("The application cannot run without the language codes CSV file.")
    return languages
def load_language_taxonomy(taxonomy_url):
    """
    Load language taxonomy data from Microsoft's linguistic diversity project.

    Fetches a CSV-like resource where each line is "language_name,level"
    and the level is an integer classification.

    Args:
        taxonomy_url: URL of the taxonomy data.

    Returns:
        dict: {language_name_lowercase: level}; empty dict on fetch failure.
    """
    taxonomy = {}
    try:
        response = requests.get(taxonomy_url, timeout=10)
        response.raise_for_status()
        # splitlines() also tolerates \r\n line endings.
        for line in response.text.splitlines():
            line = line.strip()
            if not line:
                continue
            parts = line.split(',')
            if len(parts) != 2:
                continue
            lang_name = parts[0].strip().lower()
            try:
                level = int(parts[1].strip())
            except ValueError:
                # Skip header or malformed rows; previously a single
                # non-numeric level aborted the whole parse, leaving a
                # partial taxonomy with only a warning printed.
                continue
            taxonomy[lang_name] = level
        print(f"Loaded taxonomy data for {len(taxonomy)} languages")
    except Exception as e:
        print(f"Warning: Could not load language taxonomy: {e}")
        print("Language classification will show as 'Unknown'")
    return taxonomy
def load_common_voice_data(json_path):
    """
    Load Common Voice dataset statistics.

    Args:
        json_path: Path to the Common Voice statistics JSON file.

    Returns:
        dict: {locale_code: {validHrs, totalHrs, splits, ...}};
        empty dict if the file cannot be read or parsed.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            # Statistics live under the top-level "locales" key.
            cv_data = json.load(handle).get('locales', {})
        print(f"Loaded Common Voice data for {len(cv_data)} locales")
        return cv_data
    except Exception as e:
        print(f"Warning: Could not load Common Voice data: {e}")
        print("Common Voice information will not be available")
        return {}
def load_app_content(content_path):
    """
    Load app content from a markdown file.

    The title is taken from the first level-1 heading ("# ..."), and the
    description is the text between "## Description" and the next "##"
    heading.

    Args:
        content_path: Path to the markdown content file.

    Returns:
        dict: {"title": str, "description": str, "full_content": str};
        built-in defaults if the file cannot be read.
    """
    try:
        with open(content_path, 'r', encoding='utf-8') as handle:
            text = handle.read()
    except Exception as e:
        print(f"Error loading app content: {e}")
        print("Using default content")
        return {
            "title": "Speech Resource Finder",
            "description": "Search for speech resources",
            "full_content": "",
        }

    md_lines = text.split('\n')

    # Title: first level-1 heading, falling back to the default app name.
    title = next(
        (ln[2:].strip() for ln in md_lines if ln.startswith('# ')),
        "Speech Resource Finder",
    )

    # Description: non-empty lines after "## Description" up to the next
    # "##" heading, joined with single spaces.
    collected = []
    capturing = False
    for ln in md_lines:
        if ln.startswith('## Description'):
            capturing = True
        elif capturing and ln.startswith('##'):
            break
        elif capturing and ln.strip():
            collected.append(ln.strip())

    print(f"Loaded app content from {content_path}")
    return {
        "title": title,
        "description": " ".join(collected),
        "full_content": text,
    }
def get_common_voice_stats(language_code, iso_639_1, cv_data):
    """
    Get Common Voice statistics for a language.

    Args:
        language_code: ISO 639-2 (3-letter) code.
        iso_639_1: ISO 639-1 (2-letter) code.
        cv_data: Common Voice dataset dictionary keyed by locale.

    Returns:
        dict or None: Statistics if a matching locale is found, else None.
    """
    # Resolve the CV locale key in order of preference:
    #   1. exact 3-letter code (e.g. "zgh", "kab")
    #   2. exact 2-letter code (e.g. "en", "fr")
    #   3. first region-qualified variant of the 2-letter code
    #      (e.g. "fy-NL", "ga-IE")
    cv_locale = None
    if language_code and language_code in cv_data:
        cv_locale = language_code
    elif iso_639_1 and iso_639_1 in cv_data:
        cv_locale = iso_639_1
    elif iso_639_1:
        prefix = iso_639_1 + '-'
        for candidate in cv_data:
            if candidate.startswith(prefix):
                cv_locale = candidate
                break

    locale_data = cv_data.get(cv_locale) if cv_locale else None
    if not locale_data:
        return None

    users = locale_data.get('users', 0)
    # Gender fractions are stored as 0..1; report them as percentages.
    gender = locale_data.get('splits', {}).get('gender', {})
    # Abbreviate large contributor counts, e.g. 1500 -> "2k".
    users_formatted = f"{users / 1000:.0f}k" if users >= 1000 else str(users)

    return {
        'locale': cv_locale,
        'valid_hrs': locale_data.get('validHrs', 0),
        'total_hrs': locale_data.get('totalHrs', 0),
        'male_pct': gender.get('male_masculine', 0) * 100,
        'female_pct': gender.get('female_feminine', 0) * 100,
        'users': users,
        'users_formatted': users_formatted,
    }