Skip to content

Latest commit

 

History

History
695 lines (550 loc) · 22.2 KB

File metadata and controls

695 lines (550 loc) · 22.2 KB

Locale-Aware Sorting in Python

Slide 1: Basic Locale-Aware Sorting with locale Module

The locale module provides basic functionality for locale-aware string sorting through the strcoll function, which compares two strings according to the current locale settings. This enables culturally appropriate sorting of text containing diacritical marks.

import locale

# Set locale to French
locale.setlocale(locale.LC_ALL, 'fr_FR.UTF-8')

# List of French words with accents
french_words = ['élève', 'école', 'étude', 'être', 'écrire']

# Sort using locale-aware comparison
sorted_words = sorted(french_words, key=locale.strxfrm)
print(f"Sorted French words: {sorted_words}")

# Output:
# Sorted French words: ['école', 'écrire', 'élève', 'être', 'étude']

Slide 2: PyICU Implementation for Unicode Collation

The PyICU library implements the Unicode Collation Algorithm, providing robust multilingual text sorting capabilities. It handles complex sorting rules across different writing systems and cultural conventions more reliably than the locale module.

from icu import Collator, Locale

# Create a collator for Spanish locale
spanish_collator = Collator.createInstance(Locale('es'))

# List of Spanish words with special characters
spanish_words = ['ñandu', 'nata', 'niño', 'nuez', 'ñora']

# Sort using ICU collation
sorted_words = sorted(spanish_words, key=spanish_collator.getSortKey)
print(f"Sorted Spanish words: {sorted_words}")

# Output:
# Sorted Spanish words: ['nata', 'niño', 'nuez', 'ñandu', 'ñora']

Slide 3: Multi-Locale Sorting Implementation

This implementation demonstrates handling text from multiple locales simultaneously using PyICU's advanced features. The solution maintains separate collators for different languages while providing a unified sorting interface.

from icu import Collator, Locale
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class MultiLocaleText:
    text: str
    locale: str

class MultiLocaleSorter:
    def __init__(self):
        self.collators: Dict[str, Collator] = {}
    
    def get_collator(self, locale_str: str) -> Collator:
        if locale_str not in self.collators:
            self.collators[locale_str] = Collator.createInstance(Locale(locale_str))
        return self.collators[locale_str]
    
    def sort_texts(self, texts: List[MultiLocaleText]) -> List[MultiLocaleText]:
        return sorted(texts, key=lambda x: self.get_collator(x.locale).getSortKey(x.text))

# Example usage
texts = [
    MultiLocaleText("école", "fr"),
    MultiLocaleText("ñandu", "es"),
    MultiLocaleText("über", "de")
]

sorter = MultiLocaleSorter()
sorted_texts = sorter.sort_texts(texts)
for item in sorted_texts:
    print(f"{item.text} ({item.locale})")

Slide 4: Thread-Safe Locale Handling

The implementation addresses thread safety concerns when dealing with locale-aware sorting in multi-threaded applications. This approach uses thread-local storage to maintain separate locale settings for each thread.

import threading
import locale
from contextlib import contextmanager
from typing import Generator

class ThreadSafeLocale:
    _thread_local = threading.local()
    
    @classmethod
    @contextmanager
    def temporary_locale(cls, temp_locale: str) -> Generator[None, None, None]:
        # Store current locale
        old_locale = locale.getlocale()
        
        try:
            # Set temporary locale for this thread
            locale.setlocale(locale.LC_ALL, temp_locale)
            yield
        finally:
            # Restore original locale
            locale.setlocale(locale.LC_ALL, old_locale)
    
    @classmethod
    def sort_with_locale(cls, items: list, locale_name: str) -> list:
        with cls.temporary_locale(locale_name):
            return sorted(items, key=locale.strxfrm)

# Example usage in multiple threads
def sort_in_thread(words: list, locale_name: str) -> None:
    sorted_words = ThreadSafeLocale.sort_with_locale(words, locale_name)
    print(f"Thread {threading.current_thread().name}: {sorted_words}")

# Test with multiple threads
words = ['école', 'élève', 'étude']
threads = [
    threading.Thread(target=sort_in_thread, args=(words, 'fr_FR.UTF-8')),
    threading.Thread(target=sort_in_thread, args=(words, 'en_US.UTF-8'))
]

for thread in threads:
    thread.start()
for thread in threads:
    thread.join()

Slide 5: Custom Collation Rules

Custom collation rules allow fine-grained control over sorting behavior beyond standard locale settings. This implementation demonstrates how to create and apply custom sorting rules for specialized scenarios.

from icu import Collator, Locale, RuleBasedCollator

# Define custom collation rules
# & is used to position characters relative to others
# < defines primary difference
# << defines secondary difference
custom_rules = """
    &c < č <<< Č 
    &s < š <<< Š 
    &z < ž <<< Ž
"""

def create_custom_collator():
    try:
        collator = RuleBasedCollator(custom_rules)
        collator.setStrength(Collator.SECONDARY)
        return collator
    except Exception as e:
        print(f"Error creating collator: {e}")
        return None

# Example usage
words = ['cap', 'čap', 'cup', 'Čap', 'sup', 'šup']
collator = create_custom_collator()

if collator:
    sorted_words = sorted(words, key=collator.getSortKey)
    print(f"Custom sorted words: {sorted_words}")
    
# Output:
# Custom sorted words: ['cap', 'čap', 'Čap', 'cup', 'sup', 'šup']

Slide 6: Performance Optimization for Large Datasets

When dealing with large text collections, performance optimization becomes crucial. This implementation uses caching and preprocessing to improve sorting efficiency for repeated operations.

from functools import lru_cache
from icu import Collator, Locale
import time
from typing import List, Tuple

class CachedCollationSorter:
    def __init__(self, locale_str: str):
        self.collator = Collator.createInstance(Locale(locale_str))
        self._sort_key_cache = {}
    
    @lru_cache(maxsize=10000)
    def get_sort_key(self, text: str) -> bytes:
        return self.collator.getSortKey(text)
    
    def sort_texts(self, texts: List[str]) -> Tuple[List[str], float]:
        start_time = time.time()
        sorted_texts = sorted(texts, key=self.get_sort_key)
        elapsed_time = time.time() - start_time
        return sorted_texts, elapsed_time

# Performance comparison example
def compare_sorting_performance(texts: List[str], locale_str: str):
    # Standard sorting
    standard_start = time.time()
    standard_sorted = sorted(texts)
    standard_time = time.time() - standard_start
    
    # Cached collation sorting
    sorter = CachedCollationSorter(locale_str)
    cached_sorted, cached_time = sorter.sort_texts(texts)
    
    return {
        'standard_time': standard_time,
        'cached_time': cached_time,
        'standard_count': len(standard_sorted),
        'cached_count': len(cached_sorted)
    }

# Example usage with a large dataset
large_text_list = ['é'+str(i) for i in range(10000)]
results = compare_sorting_performance(large_text_list, 'fr_FR.UTF-8')
print(f"Performance Results: {results}")

Slide 7: Handling Mixed Scripts and Writing Systems

This implementation addresses the challenge of sorting text containing multiple scripts or writing systems, such as mixed Latin, Cyrillic, and Chinese characters.

from icu import Collator, Locale, UCollAttribute, UCollAttributeValue
from typing import List, Dict

class MixedScriptSorter:
    def __init__(self):
        self.script_collators: Dict[str, Collator] = {
            'latin': self._create_collator('en'),
            'cyrillic': self._create_collator('ru'),
            'han': self._create_collator('zh')
        }
        
    def _create_collator(self, locale_str: str) -> Collator:
        collator = Collator.createInstance(Locale(locale_str))
        collator.setAttribute(
            UCollAttribute.ALTERNATE_HANDLING,
            UCollAttributeValue.SHIFTED
        )
        return collator
    
    def _detect_script(self, text: str) -> str:
        # Simplified script detection
        if any('\u0400' <= c <= '\u04FF' for c in text):
            return 'cyrillic'
        elif any('\u4e00' <= c <= '\u9fff' for c in text):
            return 'han'
        return 'latin'
    
    def sort_mixed_text(self, texts: List[str]) -> List[str]:
        return sorted(
            texts,
            key=lambda x: self.script_collators[self._detect_script(x)].getSortKey(x)
        )

# Example usage
mixed_texts = [
    'apple',      # Latin
    'яблоко',     # Cyrillic
    '苹果',       # Chinese
    'banana',     # Latin
    'банан',      # Cyrillic
    '香蕉'        # Chinese
]

sorter = MixedScriptSorter()
sorted_texts = sorter.sort_mixed_text(mixed_texts)
print("Sorted mixed scripts:", sorted_texts)

Slide 8: Real-world Application: Multilingual Address Book

This implementation demonstrates a practical address book system that handles contact names from multiple languages and provides efficient sorting and searching capabilities.

from dataclasses import dataclass
from typing import Dict, List
from icu import Collator, Locale
import json

@dataclass
class Contact:
    name: str
    locale: str
    phone: str
    email: str

class MultilingualAddressBook:
    def __init__(self):
        self.contacts: List[Contact] = []
        self.collators: Dict[str, Collator] = {}
    
    def add_contact(self, contact: Contact) -> None:
        self.contacts.append(contact)
        if contact.locale not in self.collators:
            self.collators[contact.locale] = Collator.createInstance(
                Locale(contact.locale)
            )
    
    def get_sorted_contacts(self) -> List[Contact]:
        return sorted(
            self.contacts,
            key=lambda x: self.collators[x.locale].getSortKey(x.name)
        )
    
    def export_to_json(self, filename: str) -> None:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(
                [vars(c) for c in self.contacts],
                f,
                ensure_ascii=False,
                indent=2
            )

# Example usage
address_book = MultilingualAddressBook()

# Add contacts with different locales
contacts = [
    Contact("José García", "es_ES", "+34123456789", "jose@email.com"),
    Contact("張偉", "zh_CN", "+861234567890", "wei@email.com"),
    Contact("André Martin", "fr_FR", "+33123456789", "andre@email.com")
]

for contact in contacts:
    address_book.add_contact(contact)

# Get sorted contacts
sorted_contacts = address_book.get_sorted_contacts()
for contact in sorted_contacts:
    print(f"{contact.name} ({contact.locale})")

# Export to JSON
address_book.export_to_json("contacts.json")

Slide 9: Handling Normalization in Text Sorting

Text normalization is crucial for correct sorting when dealing with decomposed Unicode characters and different representation forms. This implementation ensures consistent sorting regardless of character composition.

from unicodedata import normalize
from icu import Collator, Locale
from typing import List, Callable

class NormalizedTextSorter:
    def __init__(self, locale_str: str):
        self.collator = Collator.createInstance(Locale(locale_str))
        
    def _normalize_text(self, text: str, form: str = 'NFKC') -> str:
        return normalize(form, text)
    
    def sort_with_normalization(
        self,
        texts: List[str],
        normalization_form: str = 'NFKC'
    ) -> List[str]:
        normalized_pairs = [
            (text, self._normalize_text(text, normalization_form))
            for text in texts
        ]
        
        # Sort by normalized form but preserve original text
        sorted_pairs = sorted(
            normalized_pairs,
            key=lambda x: self.collator.getSortKey(x[1])
        )
        
        return [pair[0] for pair in sorted_pairs]

# Example usage with different Unicode representations
texts = [
    'café',      # composed
    'cafe\u0301',  # decomposed
    'über',      # composed
    'u\u0308ber'   # decomposed
]

sorter = NormalizedTextSorter('en_US.UTF-8')
sorted_texts = sorter.sort_with_normalization(texts)
print("Normalized and sorted texts:", sorted_texts)

# Test different normalization forms
for form in ['NFC', 'NFD', 'NFKC', 'NFKD']:
    sorted_texts = sorter.sort_with_normalization(texts, form)
    print(f"\nSorted with {form} normalization:", sorted_texts)

Slide 10: Fast Binary Search with Locale-Aware Comparison

This implementation optimizes searching in sorted multilingual lists by implementing a modified binary search that respects locale-specific ordering rules.

from icu import Collator, Locale
from typing import List, Optional
import time

class LocaleAwareBinarySearch:
    def __init__(self, locale_str: str):
        self.collator = Collator.createInstance(Locale(locale_str))
        
    def binary_search(self, sorted_list: List[str], target: str) -> Optional[int]:
        left, right = 0, len(sorted_list) - 1
        
        while left <= right:
            mid = (left + right) // 2
            comparison = self.collator.compare(sorted_list[mid], target)
            
            if comparison == 0:
                return mid
            elif comparison < 0:
                left = mid + 1
            else:
                right = mid - 1
                
        return None

    def insert_position(self, sorted_list: List[str], target: str) -> int:
        left, right = 0, len(sorted_list)
        
        while left < right:
            mid = (left + right) // 2
            if self.collator.compare(sorted_list[mid], target) <= 0:
                left = mid + 1
            else:
                right = mid
                
        return left

# Example usage
searcher = LocaleAwareBinarySearch('fr_FR.UTF-8')
words = ['àvoir', 'être', 'manger', 'étudier', 'écrire']
sorted_words = sorted(words, key=searcher.collator.getSortKey)

# Search for existing word
target = 'être'
result = searcher.binary_search(sorted_words, target)
print(f"Found '{target}' at index: {result}")

# Find insertion position for new word
new_word = 'élever'
pos = searcher.insert_position(sorted_words, new_word)
print(f"Insert '{new_word}' at position: {pos}")

Slide 11: Real-world Application: Multilingual Document Indexing

This implementation creates a locale-aware document indexing system suitable for managing multilingual content in a document management system.

from icu import Collator, Locale
from typing import Dict, List, Set
import re
from dataclasses import dataclass
from datetime import datetime

@dataclass
class Document:
    id: str
    title: str
    content: str
    locale: str
    created_at: datetime

class MultilingualDocumentIndex:
    def __init__(self):
        self.documents: Dict[str, Document] = {}
        self.collators: Dict[str, Collator] = {}
        self.index: Dict[str, Set[str]] = {}
        
    def _get_collator(self, locale_str: str) -> Collator:
        if locale_str not in self.collators:
            self.collators[locale_str] = Collator.createInstance(Locale(locale_str))
        return self.collators[locale_str]
    
    def _tokenize(self, text: str) -> List[str]:
        return re.findall(r'\w+', text.lower())
    
    def add_document(self, doc: Document) -> None:
        self.documents[doc.id] = doc
        
        # Index document tokens
        tokens = self._tokenize(f"{doc.title} {doc.content}")
        for token in tokens:
            if token not in self.index:
                self.index[token] = set()
            self.index[token].add(doc.id)
    
    def search(self, query: str, locale: str = None) -> List[Document]:
        tokens = self._tokenize(query)
        matching_docs = set.intersection(
            *[self.index.get(token, set()) for token in tokens]
        )
        
        results = [self.documents[doc_id] for doc_id in matching_docs]
        
        if locale:
            collator = self._get_collator(locale)
            results.sort(key=lambda x: collator.getSortKey(x.title))
            
        return results

# Example usage
index = MultilingualDocumentIndex()

# Add sample documents
docs = [
    Document("1", "L'éducation en France", "Système éducatif...", "fr_FR", 
             datetime.now()),
    Document("2", "La educación en España", "Sistema educativo...", "es_ES",
             datetime.now()),
    Document("3", "Education in England", "Educational system...", "en_GB",
             datetime.now())
]

for doc in docs:
    index.add_document(doc)

# Search and sort results by French locale
results = index.search("education", "fr_FR")
for doc in results:
    print(f"{doc.title} ({doc.locale})")

Slide 12: Advanced Sorting with Weight-based Rules

This implementation demonstrates how to create complex sorting rules with weighted priorities for different character attributes, useful for specialized sorting requirements in academic or technical applications.

from icu import Collator, Locale, UCollAttribute, UCollAttributeValue
from typing import List, Tuple, Dict
import re

class WeightedCollationSorter:
    def __init__(self, base_locale: str):
        self.collator = Collator.createInstance(Locale(base_locale))
        self.weights: Dict[str, int] = {}
        
    def set_weights(self, weights: Dict[str, int]) -> None:
        self.weights = weights
        
    def configure_collator(self, 
                         strength: int = Collator.TERTIARY,
                         case_level: bool = True) -> None:
        self.collator.setStrength(strength)
        self.collator.setAttribute(
            UCollAttribute.CASE_LEVEL,
            1 if case_level else 0
        )
        
    def get_weighted_key(self, text: str) -> Tuple[int, bytes]:
        weight = sum(self.weights.get(char, 0) for char in text)
        return (-weight, self.collator.getSortKey(text))
        
    def sort_texts(self, texts: List[str]) -> List[str]:
        return sorted(texts, key=self.get_weighted_key)

# Example usage
sorter = WeightedCollationSorter('en_US.UTF-8')

# Define custom weights for special characters
weights = {
    'α': 100,  # Greek alpha
    'β': 90,   # Greek beta
    'γ': 80,   # Greek gamma
    '∑': 70,   # Summation
    '∫': 60    # Integral
}

sorter.set_weights(weights)
sorter.configure_collator(strength=Collator.QUATERNARY)

# Test with mathematical expressions
expressions = [
    'f(x) = αx + β',
    '∫f(x)dx',
    'γ = 2π',
    '∑x_i',
    'β = 1/2'
]

sorted_expressions = sorter.sort_texts(expressions)
print("Sorted mathematical expressions:")
for expr in sorted_expressions:
    print(expr)

Slide 13: Results and Performance Analysis

This implementation provides comprehensive benchmarking and analysis capabilities for comparing different sorting approaches across various scenarios.

import time
import statistics
from typing import List, Dict, Any
from dataclasses import dataclass
from icu import Collator, Locale

@dataclass
class BenchmarkResult:
    method: str
    avg_time: float
    std_dev: float
    dataset_size: int
    locale: str

class SortingBenchmark:
    def __init__(self):
        self.results: List[BenchmarkResult] = []
        
    def benchmark_sort(self, 
                      method_name: str,
                      sort_func: callable,
                      dataset: List[str],
                      locale: str,
                      iterations: int = 5) -> BenchmarkResult:
        times: List[float] = []
        
        for _ in range(iterations):
            start_time = time.perf_counter()
            sort_func(dataset.copy())
            end_time = time.perf_counter()
            times.append(end_time - start_time)
            
        result = BenchmarkResult(
            method=method_name,
            avg_time=statistics.mean(times),
            std_dev=statistics.stdev(times),
            dataset_size=len(dataset),
            locale=locale
        )
        self.results.append(result)
        return result
    
    def generate_report(self) -> Dict[str, Any]:
        return {
            'summary': {
                'total_methods': len(self.results),
                'fastest_method': min(
                    self.results,
                    key=lambda x: x.avg_time
                ).method,
                'slowest_method': max(
                    self.results,
                    key=lambda x: x.avg_time
                ).method
            },
            'detailed_results': [
                {
                    'method': r.method,
                    'avg_time_ms': r.avg_time * 1000,
                    'std_dev_ms': r.std_dev * 1000,
                    'dataset_size': r.dataset_size,
                    'locale': r.locale
                }
                for r in self.results
            ]
        }

# Example usage
benchmark = SortingBenchmark()

# Test data
test_data = ['école', 'élève', 'être', 'étudier'] * 1000

# Benchmark different sorting methods
collator = Collator.createInstance(Locale('fr_FR.UTF-8'))

methods = {
    'python_default': lambda x: sorted(x),
    'icu_collation': lambda x: sorted(x, key=collator.getSortKey),
    'locale_strxfrm': lambda x: sorted(x, key=locale.strxfrm)
}

for name, func in methods.items():
    result = benchmark.benchmark_sort(name, func, test_data, 'fr_FR.UTF-8')
    print(f"\nMethod: {name}")
    print(f"Average time: {result.avg_time*1000:.2f}ms")
    print(f"Standard deviation: {result.std_dev*1000:.2f}ms")

report = benchmark.generate_report()
print("\nComplete Benchmark Report:")
print(report)

Slide 14: Additional Resources