feat: init

2026-01-30 19:41:16 +01:00
commit 76581db30b
9 changed files with 467 additions and 0 deletions
--- a/core/matcher.py
+++ b/core/matcher.py
@@ -0,0 +1,49 @@
+import re
+from difflib import SequenceMatcher
+from typing import List, Tuple
+
+from config import SYNC_CONFIG
+from .types import SubtitleEntry, WhisperSegment
+
+
+class TextMatcher:
+    """Fuzzy text matching between subtitle entries and Whisper segments.
+
+    All methods are static; the class only serves as a namespace.
+    """
+
+    # Anything that is neither a word character nor whitespace.
+    _PUNCTUATION_RE = re.compile(r'[^\w\s]')
+    # Any run of whitespace (spaces, tabs, newlines, ...).
+    _WHITESPACE_RE = re.compile(r'\s+')
+
+    @staticmethod
+    def normalize_text(text: str) -> str:
+        """Return *text* without punctuation, lower-cased and single-spaced.
+
+        Characters that are neither word characters nor whitespace are
+        removed, the result is lower-cased, every whitespace run (newlines
+        included) becomes one space, and the ends are stripped.
+        """
+        without_punctuation = TextMatcher._PUNCTUATION_RE.sub('', text).lower()
+        return TextMatcher._WHITESPACE_RE.sub(' ', without_punctuation).strip()
+
+    @staticmethod
+    def _normalized_similarity(norm1: str, norm2: str) -> float:
+        """Return the similarity ratio of two already-normalized strings.
+
+        Returns 0.0 when either string is empty.
+        """
+        if not norm1 or not norm2:
+            return 0.0
+        # autojunk must be off: for sequences of 200+ items difflib treats
+        # frequently occurring items as junk, which for character strings
+        # is nearly every letter and collapses the ratio of long texts.
+        return SequenceMatcher(None, norm1, norm2, autojunk=False).ratio()
+
+    @staticmethod
+    def text_similarity(text1: str, text2: str) -> float:
+        """Return a similarity ratio in [0.0, 1.0] for two raw texts.
+
+        Both texts are normalized with ``normalize_text`` first. If either
+        one is empty after normalization the result is 0.0.
+        """
+        return TextMatcher._normalized_similarity(
+            TextMatcher.normalize_text(text1),
+            TextMatcher.normalize_text(text2),
+        )
+
+    @staticmethod
+    def find_matches(subtitles: List[SubtitleEntry], whisper_segments: List[WhisperSegment], chunk_start_ms: int) -> \
+            List[Tuple[SubtitleEntry, int, float]]:
+        """Pair each Whisper segment with its most similar nearby subtitle.
+
+        Args:
+            subtitles: Entries providing ``start_ms`` and ``raw_text``.
+            whisper_segments: Segments providing ``start_ms`` and ``text``;
+                each ``start_ms`` is added to ``chunk_start_ms``.
+            chunk_start_ms: Offset added to every segment start.
+
+        Returns:
+            One ``(subtitle, absolute_segment_start, score)`` tuple per
+            segment whose best candidate scores at least
+            ``SYNC_CONFIG['min_match_score']``. Segments without such a
+            candidate are omitted; the same subtitle may appear for more
+            than one segment.
+        """
+        # Config values are multiplied by 1000 (seconds -> milliseconds,
+        # going by the key names).
+        window = SYNC_CONFIG['search_window_sec'] * 1000
+        scan_dur = SYNC_CONFIG['scan_duration_sec'] * 1000
+
+        # Pre-filter once: keep subtitles that start near the scanned range
+        # and have at least 3 raw characters, and normalize each of them a
+        # single time instead of once per Whisper segment.
+        earliest = chunk_start_ms - window
+        latest = chunk_start_ms + scan_dur + window
+        candidates = []
+        for sub in subtitles:
+            if not (earliest <= sub.start_ms <= latest):
+                continue
+            if len(sub.raw_text) < 3:
+                continue
+            sub_norm = TextMatcher.normalize_text(sub.raw_text)
+            # An empty normalized text always scores 0.0 and can never win.
+            if sub_norm:
+                candidates.append((sub, sub_norm))
+
+        matches = []
+        for w_seg in whisper_segments:
+            abs_start = w_seg.start_ms + chunk_start_ms
+            seg_norm = TextMatcher.normalize_text(w_seg.text)
+            if not seg_norm:
+                # Nothing to compare against; every score would be 0.0.
+                continue
+
+            best_sub = None
+            best_score = 0.0
+            for sub, sub_norm in candidates:
+                if not (abs_start - window <= sub.start_ms <= abs_start + window):
+                    continue
+                score = TextMatcher._normalized_similarity(sub_norm, seg_norm)
+                # Strict comparison: the first of equally scored subtitles
+                # wins, and a score of 0.0 never produces a match.
+                if score > best_score:
+                    best_score = score
+                    best_sub = sub
+
+            if best_sub is not None and best_score >= SYNC_CONFIG['min_match_score']:
+                matches.append((best_sub, abs_start, best_score))
+        return matches