50 lines
1.8 KiB
Python
50 lines
1.8 KiB
Python
import re
|
|
from difflib import SequenceMatcher
|
|
from typing import List, Tuple
|
|
|
|
from config import SYNC_CONFIG
|
|
from .types import SubtitleEntry, WhisperSegment
|
|
|
|
|
|
class TextMatcher:
    """Fuzzy text matching between subtitle entries and Whisper segments.

    All methods are static; the class only serves as a namespace.
    """

    # Compiled once so repeated normalization does not re-resolve the patterns.
    _PUNCTUATION_RE = re.compile(r'[^\w\s]')
    _WHITESPACE_RE = re.compile(r'\s+')

    @staticmethod
    def normalize_text(text: str) -> str:
        """Return *text* lower-cased, without punctuation, single-spaced.

        Every character that is neither a word character nor whitespace is
        removed, runs of whitespace (including newlines) collapse to a single
        space, and leading/trailing whitespace is stripped.
        """
        without_punctuation = TextMatcher._PUNCTUATION_RE.sub('', text).lower()
        # The whitespace collapse already turns newlines into spaces, so no
        # separate newline replacement is needed.
        return TextMatcher._WHITESPACE_RE.sub(' ', without_punctuation).strip()

    @staticmethod
    def text_similarity(text1: str, text2: str) -> float:
        """Return the ``SequenceMatcher`` ratio of the two normalized texts.

        The result is ``0.0`` when either text is empty after normalization.
        """
        n1 = TextMatcher.normalize_text(text1)
        n2 = TextMatcher.normalize_text(text2)
        if not n1 or not n2:
            return 0.0
        return SequenceMatcher(None, n1, n2).ratio()

    @staticmethod
    def find_matches(
        subtitles: List[SubtitleEntry],
        whisper_segments: List[WhisperSegment],
        chunk_start_ms: int,
    ) -> List[Tuple[SubtitleEntry, int, float]]:
        """Pair each Whisper segment with its most similar nearby subtitle.

        Args:
            subtitles: Subtitle entries; ``start_ms`` and ``raw_text`` are read.
            whisper_segments: Segments whose ``start_ms`` is relative to the
                chunk; ``start_ms`` and ``text`` are read.
            chunk_start_ms: Offset added to every segment's ``start_ms``.

        Returns:
            One ``(subtitle, absolute_segment_start, score)`` tuple for every
            segment whose best-scoring subtitle reaches
            ``SYNC_CONFIG['min_match_score']``, in segment order. When several
            subtitles tie, the earliest one in ``subtitles`` wins.
        """
        if not subtitles or not whisper_segments:
            return []

        # Config values are multiplied by 1000 (the keys are named ``*_sec``
        # while the attributes compared against are named ``*_ms``).
        window = SYNC_CONFIG['search_window_sec'] * 1000
        scan_dur = SYNC_CONFIG['scan_duration_sec'] * 1000
        min_score = SYNC_CONFIG['min_match_score']

        # Pre-filter to subtitles that can fall inside any segment's window and
        # normalize each of them once instead of once per segment. Texts that
        # are too short or normalize to nothing could only score 0.0, which
        # never beats the initial best score, so they are dropped here.
        earliest = chunk_start_ms - window
        latest = chunk_start_ms + scan_dur + window
        candidates = []
        for sub in subtitles:
            if not earliest <= sub.start_ms <= latest:
                continue
            if len(sub.raw_text) < 3:
                continue
            normalized_sub = TextMatcher.normalize_text(sub.raw_text)
            if normalized_sub:
                candidates.append((sub, normalized_sub))

        matches = []
        for w_seg in whisper_segments:
            normalized_seg = TextMatcher.normalize_text(w_seg.text)
            if not normalized_seg:
                # An empty text scores 0.0 against everything: no match.
                continue

            abs_start = w_seg.start_ms + chunk_start_ms
            lower_bound = abs_start - window
            upper_bound = abs_start + window

            # SequenceMatcher caches its analysis of the second sequence, so
            # the segment text is fixed as seq2 and only seq1 is swapped per
            # subtitle. The (subtitle, segment) order matches text_similarity.
            matcher = SequenceMatcher(None, '', normalized_seg)

            best_sub = None
            best_score = 0.0
            for sub, normalized_sub in candidates:
                if not lower_bound <= sub.start_ms <= upper_bound:
                    continue
                matcher.set_seq1(normalized_sub)
                score = matcher.ratio()
                if score > best_score:
                    best_score = score
                    best_sub = sub

            if best_sub is not None and best_score >= min_score:
                matches.append((best_sub, abs_start, best_score))

        return matches
|