# speech-resource-finder / data_loaders.py
# Author: Alp — commit 5ea1cbe ("wiki search, huge refactor")
"""
Data loading utilities for Speech Resource Finder
"""
import csv
import json
import requests
def load_language_list(csv_path):
    """
    Load ISO 639 language codes from a CSV file.

    Expected columns: 'ISO 639-2', 'ISO 639-1', 'English name', 'French name'.
    Rows missing the 3-letter code or the English name are skipped.

    Args:
        csv_path: Path to the language codes CSV file.

    Returns:
        dict: {iso_639_2: {"name": str, "iso_639_1": str, "french_name": str}};
        empty dict if the file cannot be read.
    """
    languages = {}
    try:
        with open(csv_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                # `row.get(...) or ''` guards against missing columns and
                # short rows (DictReader yields None for absent fields),
                # which would otherwise raise and abort the whole load.
                iso_639_2 = (row.get('ISO 639-2') or '').strip()
                iso_639_1 = (row.get('ISO 639-1') or '').strip()
                name = (row.get('English name') or '').strip()
                french_name = (row.get('French name') or '').strip()
                # Only keep rows that have both a 3-letter code and a name
                if iso_639_2 and name:
                    languages[iso_639_2] = {
                        "name": name,
                        "iso_639_1": iso_639_1,
                        "french_name": french_name,
                    }
        print(f"Loaded {len(languages)} languages from {csv_path}")
    except Exception as e:
        print(f"ERROR: Failed to load language list from {csv_path}: {e}")
        print("The application cannot run without the language codes CSV file.")
    return languages
def load_language_taxonomy(taxonomy_url):
    """
    Load language taxonomy data from Microsoft's linguistic diversity project.

    Args:
        taxonomy_url: URL of a CSV-like resource with one
            "language_name,level" pair per line.

    Returns:
        dict: {language_name_lowercase: level}; empty dict if the
        download fails.
    """
    taxonomy = {}
    try:
        response = requests.get(taxonomy_url, timeout=10)
        response.raise_for_status()
        # Parse the CSV-like content (format: language_name,level)
        for line in response.text.strip().split('\n'):
            if not line.strip():
                continue
            parts = line.strip().split(',')
            if len(parts) != 2:
                continue
            lang_name = parts[0].strip().lower()
            # Skip malformed lines (e.g. a header row or a non-numeric
            # level) instead of letting one bad line raise and abort the
            # whole parse, which previously discarded everything loaded.
            try:
                level = int(parts[1].strip())
            except ValueError:
                continue
            taxonomy[lang_name] = level
        print(f"Loaded taxonomy data for {len(taxonomy)} languages")
    except Exception as e:
        print(f"Warning: Could not load language taxonomy: {e}")
        print("Language classification will show as 'Unknown'")
    return taxonomy
def load_common_voice_data(json_path):
    """
    Load Common Voice dataset statistics from a JSON dump.

    Args:
        json_path: Path to the Common Voice statistics JSON file.

    Returns:
        dict: {locale_code: {validHrs, totalHrs, splits, ...}}; empty
        dict if the file cannot be read or parsed.
    """
    cv_data = {}
    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
        # Only the per-locale statistics are of interest here.
        cv_data = payload.get('locales', {})
        print(f"Loaded Common Voice data for {len(cv_data)} locales")
    except Exception as e:
        print(f"Warning: Could not load Common Voice data: {e}")
        print("Common Voice information will not be available")
    return cv_data
def _extract_title(lines):
    """Return the text of the first level-1 heading ('# ...'), or the default title."""
    for line in lines:
        if line.startswith('# '):
            return line[2:].strip()
    return "Speech Resource Finder"
def _extract_description(lines):
    """Join the non-empty lines between '## Description' and the next '##' heading."""
    chunks = []
    in_description = False
    for line in lines:
        if line.startswith('## Description'):
            in_description = True
        elif in_description and line.startswith('##'):
            # Next section reached — description is complete.
            break
        elif in_description and line.strip():
            chunks.append(line.strip())
    return ' '.join(chunks)
def load_app_content(content_path):
    """
    Load app content from a markdown file.

    The title comes from the first '# ' heading; the description from the
    text under '## Description' (up to the next '##' heading).

    Args:
        content_path: Path to the markdown content file.

    Returns:
        dict: {"title": str, "description": str, "full_content": str};
        defaults are returned when the file cannot be read.
    """
    app_content = {
        "title": "Speech Resource Finder",
        "description": "Search for speech resources",
        "full_content": ""
    }
    try:
        with open(content_path, 'r', encoding='utf-8') as f:
            content = f.read()
        lines = content.split('\n')
        app_content = {
            "title": _extract_title(lines),
            "description": _extract_description(lines),
            "full_content": content
        }
        print(f"Loaded app content from {content_path}")
    except Exception as e:
        print(f"Error loading app content: {e}")
        print("Using default content")
    return app_content
def get_common_voice_stats(language_code, iso_639_1, cv_data):
    """
    Get Common Voice statistics for a language.

    Resolution order: exact ISO 639-2 (3-letter) match, then exact
    ISO 639-1 (2-letter) match, then the first locale of the form
    '<iso_639_1>-<REGION>' (e.g. "fy-NL").

    Args:
        language_code: ISO 639-2 (3-letter) code
        iso_639_1: ISO 639-1 (2-letter) code
        cv_data: Common Voice dataset dictionary

    Returns:
        dict or None: Statistics if found, None otherwise
    """
    cv_locale = None
    locale_data = None
    # 1. Exact 3-letter code (e.g. "zgh", "kab")
    if language_code and language_code in cv_data:
        cv_locale, locale_data = language_code, cv_data[language_code]
    elif iso_639_1:
        # 2. Exact 2-letter code (e.g. "en", "fr")
        if iso_639_1 in cv_data:
            cv_locale, locale_data = iso_639_1, cv_data[iso_639_1]
        else:
            # 3. First regional variant prefixed by the 2-letter code
            #    (e.g. "fy-NL", "ga-IE")
            prefix = iso_639_1 + '-'
            for loc in cv_data:
                if loc.startswith(prefix):
                    cv_locale, locale_data = loc, cv_data[loc]
                    break
    if not locale_data:
        return None
    valid_hrs = locale_data.get('validHrs', 0)
    total_hrs = locale_data.get('totalHrs', 0)
    users = locale_data.get('users', 0)
    # Gender proportions are stored as fractions; convert to percentages.
    gender = locale_data.get('splits', {}).get('gender', {})
    male_pct = gender.get('male_masculine', 0) * 100
    female_pct = gender.get('female_feminine', 0) * 100
    # Abbreviate large contributor counts (e.g. 2300 -> "2k").
    users_formatted = f"{users / 1000:.0f}k" if users >= 1000 else str(users)
    return {
        'locale': cv_locale,
        'valid_hrs': valid_hrs,
        'total_hrs': total_hrs,
        'male_pct': male_pct,
        'female_pct': female_pct,
        'users': users,
        'users_formatted': users_formatted
    }