import re
from difflib import SequenceMatcher
from typing import List, Tuple

from config import SYNC_CONFIG

from .types import SubtitleEntry, WhisperSegment

# Compiled once at import time; both patterns run on every normalization.
_NON_WORD_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')


class TextMatcher:
    """Fuzzy text matching between subtitle entries and Whisper segments."""

    @staticmethod
    def normalize_text(text: str) -> str:
        """Return *text* reduced to a canonical form for comparison.

        Every character that is neither a word character nor whitespace is
        removed, the result is lower-cased, each run of whitespace
        (newlines included) is collapsed to a single space, and leading and
        trailing whitespace is stripped.
        """
        without_punctuation = _NON_WORD_RE.sub('', text).lower()
        return _WHITESPACE_RE.sub(' ', without_punctuation).strip()

    @staticmethod
    def text_similarity(text1: str, text2: str) -> float:
        """Return the ``SequenceMatcher`` ratio of the two normalized texts.

        Returns ``0.0`` when either text is empty after normalization.
        """
        n1 = TextMatcher.normalize_text(text1)
        n2 = TextMatcher.normalize_text(text2)
        if not n1 or not n2:
            return 0.0
        return SequenceMatcher(None, n1, n2).ratio()

    @staticmethod
    def find_matches(
        subtitles: List[SubtitleEntry],
        whisper_segments: List[WhisperSegment],
        chunk_start_ms: int,
    ) -> List[Tuple[SubtitleEntry, int, float]]:
        """Pair each Whisper segment with its most similar nearby subtitle.

        Args:
            subtitles: Candidate subtitle entries; ``start_ms`` and
                ``raw_text`` are read from each.
            whisper_segments: Transcribed segments; each ``start_ms`` is
                added to ``chunk_start_ms`` to obtain the start time that
                is compared against subtitle start times.
            chunk_start_ms: Offset added to every segment's ``start_ms``.

        Returns:
            One ``(subtitle, segment_start, score)`` tuple per segment whose
            best-scoring subtitle reaches ``SYNC_CONFIG['min_match_score']``,
            in segment order. ``segment_start`` is the segment's
            ``start_ms`` plus ``chunk_start_ms``. A subtitle may appear in
            more than one tuple.
        """
        # Config values are multiplied by 1000 to compare against *_ms fields.
        window = SYNC_CONFIG['search_window_sec'] * 1000
        scan_dur = SYNC_CONFIG['scan_duration_sec'] * 1000
        min_score = SYNC_CONFIG['min_match_score']

        # Pre-filter once and normalize each subtitle text a single time
        # instead of once per Whisper segment. Subtitles with fewer than
        # three raw characters are never considered, and ones that normalize
        # to an empty string would score 0.0 and so can never be selected.
        earliest = chunk_start_ms - window
        latest = chunk_start_ms + scan_dur + window
        prepared = []
        for sub in subtitles:
            if not (earliest <= sub.start_ms <= latest):
                continue
            if len(sub.raw_text) < 3:
                continue
            sub_text = TextMatcher.normalize_text(sub.raw_text)
            if sub_text:
                prepared.append((sub, sub_text))

        matches = []
        for w_seg in whisper_segments:
            abs_start = w_seg.start_ms + chunk_start_ms
            lower, upper = abs_start - window, abs_start + window
            candidates = [
                (sub, sub_text)
                for sub, sub_text in prepared
                if lower <= sub.start_ms <= upper
            ]
            if not candidates:
                continue

            segment_text = TextMatcher.normalize_text(w_seg.text)
            if not segment_text:
                # Every comparison would score 0.0, so nothing can match.
                continue

            # SequenceMatcher caches its analysis of the second sequence, so
            # keep the segment text there and swap only the first sequence.
            # The (subtitle, segment) argument order matches text_similarity.
            matcher = SequenceMatcher(None, b=segment_text)
            best_sub = None
            best_score = 0.0
            for sub, sub_text in candidates:
                matcher.set_seq1(sub_text)
                score = matcher.ratio()
                # Strict comparison: the earliest candidate wins a tie.
                if score > best_score:
                    best_score = score
                    best_sub = sub

            if best_sub is not None and best_score >= min_score:
                matches.append((best_sub, abs_start, best_score))

        return matches