feat: init

This commit is contained in:
2026-01-30 19:41:16 +01:00
commit 76581db30b
9 changed files with 467 additions and 0 deletions

49
core/matcher.py Normal file
View File

@@ -0,0 +1,49 @@
import re
from difflib import SequenceMatcher
from typing import List, Tuple
from config import SYNC_CONFIG
from .types import SubtitleEntry, WhisperSegment
class TextMatcher:
    """Fuzzy text matching between subtitle entries and Whisper segments.

    All methods are static; the class only serves as a namespace.
    """

    # Compiled once so the per-call cost is just the substitution itself.
    _PUNCTUATION_RE = re.compile(r'[^\w\s]')
    _WHITESPACE_RE = re.compile(r'\s+')

    @staticmethod
    def normalize_text(text: str) -> str:
        """Return *text* reduced to a canonical form for comparison.

        Every character that is neither a word character nor whitespace is
        removed, the result is case-folded, and each run of whitespace
        (including newlines) is collapsed to a single space with leading and
        trailing whitespace stripped.

        Args:
            text: Raw text to normalize.

        Returns:
            The normalized text; empty if nothing but punctuation/whitespace
            was present.
        """
        without_punct = TextMatcher._PUNCTUATION_RE.sub('', text)
        # casefold() rather than lower(): it removes case distinctions that
        # lower() keeps (e.g. "ß" vs "SS"), so caseless-equal texts compare
        # equal after normalization.
        folded = without_punct.casefold()
        return TextMatcher._WHITESPACE_RE.sub(' ', folded).strip()

    @staticmethod
    def text_similarity(text1: str, text2: str) -> float:
        """Return the similarity ratio of two texts after normalization.

        Args:
            text1: First text.
            text2: Second text.

        Returns:
            ``SequenceMatcher.ratio()`` of the normalized texts, or ``0.0``
            when either text is empty after normalization.
        """
        n1 = TextMatcher.normalize_text(text1)
        n2 = TextMatcher.normalize_text(text2)
        if not n1 or not n2:
            return 0.0
        return SequenceMatcher(None, n1, n2).ratio()

    @staticmethod
    def find_matches(
        subtitles: List[SubtitleEntry],
        whisper_segments: List[WhisperSegment],
        chunk_start_ms: int,
    ) -> List[Tuple[SubtitleEntry, int, float]]:
        """Pair each Whisper segment with its most similar nearby subtitle.

        A subtitle is a candidate for a segment when its ``start_ms`` lies
        within ``SYNC_CONFIG['search_window_sec']`` of the segment's absolute
        start (``w_seg.start_ms + chunk_start_ms``) and its ``raw_text`` has
        at least 3 characters. The candidate with the highest similarity
        wins (the earliest one in ``subtitles`` on ties) and is reported only
        if its score reaches ``SYNC_CONFIG['min_match_score']``.

        Args:
            subtitles: Subtitle entries; ``start_ms`` and ``raw_text`` are read.
            whisper_segments: Segments; ``start_ms`` and ``text`` are read.
            chunk_start_ms: Offset added to each segment's ``start_ms``.

        Returns:
            A list of ``(subtitle, segment_absolute_start_ms, score)`` tuples,
            in the order of ``whisper_segments``. The same subtitle may appear
            for more than one segment.
        """
        window = SYNC_CONFIG['search_window_sec'] * 1000
        scan_dur = SYNC_CONFIG['scan_duration_sec'] * 1000
        min_score = SYNC_CONFIG['min_match_score']
        normalize = TextMatcher.normalize_text

        # Pre-filter once: keep only subtitles near the scanned chunk, drop
        # the ones too short to match, and normalize each text a single time
        # instead of once per (segment, subtitle) pair. Subtitles that are
        # empty after normalization can never score above 0.0, so they are
        # dropped here as well.
        earliest = chunk_start_ms - window
        latest = chunk_start_ms + scan_dur + window
        candidates = []
        for sub in subtitles:
            if not (earliest <= sub.start_ms <= latest):
                continue
            if len(sub.raw_text) < 3:
                continue
            sub_text = normalize(sub.raw_text)
            if sub_text:
                candidates.append((sub, sub_text))

        matches = []
        # SequenceMatcher caches its analysis of the second sequence, so the
        # segment text is set once via set_seq2() and only the first sequence
        # is swapped per candidate. Argument order matches text_similarity(),
        # so the scores are the same as calling it pair by pair.
        matcher = SequenceMatcher(None)
        for w_seg in whisper_segments:
            spoken = normalize(w_seg.text)
            if not spoken:
                # Nothing to compare against: every score would be 0.0.
                continue
            abs_start = w_seg.start_ms + chunk_start_ms
            matcher.set_seq2(spoken)

            best_sub = None
            best_score = 0.0
            for sub, sub_text in candidates:
                if not (abs_start - window <= sub.start_ms <= abs_start + window):
                    continue
                matcher.set_seq1(sub_text)
                score = matcher.ratio()
                # Strict '>' keeps the earliest subtitle on equal scores.
                if score > best_score:
                    best_score = score
                    best_sub = sub

            if best_sub is not None and best_score >= min_score:
                matches.append((best_sub, abs_start, best_score))
        return matches