# speech-resource-finder / data_loaders.py
# Author: Alp — commit 5ea1cbe ("wiki search, huge refactor")
"""
Data loading utilities for Speech Resource Finder
"""
import csv
import json
import requests
def load_language_list(csv_path):
    """
    Load ISO 639 language codes from a CSV file.

    Expected columns: 'ISO 639-2', 'ISO 639-1', 'English name', 'French name'.
    Rows missing the 3-letter code or the English name are skipped.

    Args:
        csv_path: Path to the language codes CSV file.

    Returns:
        dict: {iso_639_2: {"name": str, "iso_639_1": str, "french_name": str}};
        empty dict if the file cannot be read.
    """
    languages = {}
    try:
        with open(csv_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                # `row.get(...) or ''` guards against missing columns and
                # short rows (DictReader yields None for absent fields),
                # which would otherwise raise and abort the whole load.
                iso_639_2 = (row.get('ISO 639-2') or '').strip()
                iso_639_1 = (row.get('ISO 639-1') or '').strip()
                name = (row.get('English name') or '').strip()
                french_name = (row.get('French name') or '').strip()
                # Only keep rows that have both a 3-letter code and a name
                if iso_639_2 and name:
                    languages[iso_639_2] = {
                        "name": name,
                        "iso_639_1": iso_639_1,
                        "french_name": french_name,
                    }
        print(f"Loaded {len(languages)} languages from {csv_path}")
    except Exception as e:
        print(f"ERROR: Failed to load language list from {csv_path}: {e}")
        print("The application cannot run without the language codes CSV file.")
    return languages
def load_language_taxonomy(taxonomy_url):
    """
    Load language taxonomy data from Microsoft's linguistic diversity project.

    Args:
        taxonomy_url: URL of a CSV-like resource with one
            "language_name,level" pair per line.

    Returns:
        dict: {language_name_lowercase: level}; empty dict if the
        download fails.
    """
    taxonomy = {}
    try:
        response = requests.get(taxonomy_url, timeout=10)
        response.raise_for_status()
        # Parse the CSV-like content (format: language_name,level)
        for line in response.text.strip().split('\n'):
            if not line.strip():
                continue
            parts = line.strip().split(',')
            if len(parts) != 2:
                continue
            lang_name = parts[0].strip().lower()
            # Skip malformed lines (e.g. a header row or a non-numeric
            # level) instead of letting one bad line raise and abort the
            # whole parse, which previously discarded everything loaded.
            try:
                level = int(parts[1].strip())
            except ValueError:
                continue
            taxonomy[lang_name] = level
        print(f"Loaded taxonomy data for {len(taxonomy)} languages")
    except Exception as e:
        print(f"Warning: Could not load language taxonomy: {e}")
        print("Language classification will show as 'Unknown'")
    return taxonomy
def load_common_voice_data(json_path):
    """
    Load Common Voice dataset statistics from a JSON dump.

    Args:
        json_path: Path to the Common Voice statistics JSON file.

    Returns:
        dict: {locale_code: {validHrs, totalHrs, splits, ...}}; empty
        dict if the file cannot be read or parsed.
    """
    cv_data = {}
    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
        # Only the per-locale statistics are of interest here.
        cv_data = payload.get('locales', {})
        print(f"Loaded Common Voice data for {len(cv_data)} locales")
    except Exception as e:
        print(f"Warning: Could not load Common Voice data: {e}")
        print("Common Voice information will not be available")
    return cv_data
def _extract_title(lines):
    """Return the text of the first level-1 heading ('# ...'), or the default title."""
    for line in lines:
        if line.startswith('# '):
            return line[2:].strip()
    return "Speech Resource Finder"
def _extract_description(lines):
    """Join the non-empty lines between '## Description' and the next '##' heading."""
    chunks = []
    in_description = False
    for line in lines:
        if line.startswith('## Description'):
            in_description = True
        elif in_description and line.startswith('##'):
            # Next section reached — description is complete.
            break
        elif in_description and line.strip():
            chunks.append(line.strip())
    return ' '.join(chunks)
def load_app_content(content_path):
    """
    Load app content from a markdown file.

    The title comes from the first '# ' heading; the description from the
    text under '## Description' (up to the next '##' heading).

    Args:
        content_path: Path to the markdown content file.

    Returns:
        dict: {"title": str, "description": str, "full_content": str};
        defaults are returned when the file cannot be read.
    """
    app_content = {
        "title": "Speech Resource Finder",
        "description": "Search for speech resources",
        "full_content": ""
    }
    try:
        with open(content_path, 'r', encoding='utf-8') as f:
            content = f.read()
        lines = content.split('\n')
        app_content = {
            "title": _extract_title(lines),
            "description": _extract_description(lines),
            "full_content": content
        }
        print(f"Loaded app content from {content_path}")
    except Exception as e:
        print(f"Error loading app content: {e}")
        print("Using default content")
    return app_content
def get_common_voice_stats(language_code, iso_639_1, cv_data):
    """
    Get Common Voice statistics for a language.

    Resolution order: exact ISO 639-2 (3-letter) match, then exact
    ISO 639-1 (2-letter) match, then the first locale of the form
    '<iso_639_1>-<REGION>' (e.g. "fy-NL").

    Args:
        language_code: ISO 639-2 (3-letter) code
        iso_639_1: ISO 639-1 (2-letter) code
        cv_data: Common Voice dataset dictionary

    Returns:
        dict or None: Statistics if found, None otherwise
    """
    cv_locale = None
    locale_data = None
    # 1. Exact 3-letter code (e.g. "zgh", "kab")
    if language_code and language_code in cv_data:
        cv_locale, locale_data = language_code, cv_data[language_code]
    elif iso_639_1:
        # 2. Exact 2-letter code (e.g. "en", "fr")
        if iso_639_1 in cv_data:
            cv_locale, locale_data = iso_639_1, cv_data[iso_639_1]
        else:
            # 3. First regional variant prefixed by the 2-letter code
            #    (e.g. "fy-NL", "ga-IE")
            prefix = iso_639_1 + '-'
            for loc in cv_data:
                if loc.startswith(prefix):
                    cv_locale, locale_data = loc, cv_data[loc]
                    break
    if not locale_data:
        return None
    valid_hrs = locale_data.get('validHrs', 0)
    total_hrs = locale_data.get('totalHrs', 0)
    users = locale_data.get('users', 0)
    # Gender proportions are stored as fractions; convert to percentages.
    gender = locale_data.get('splits', {}).get('gender', {})
    male_pct = gender.get('male_masculine', 0) * 100
    female_pct = gender.get('female_feminine', 0) * 100
    # Abbreviate large contributor counts (e.g. 2300 -> "2k").
    users_formatted = f"{users / 1000:.0f}k" if users >= 1000 else str(users)
    return {
        'locale': cv_locale,
        'valid_hrs': valid_hrs,
        'total_hrs': total_hrs,
        'male_pct': male_pct,
        'female_pct': female_pct,
        'users': users,
        'users_formatted': users_formatted
    }