feat: init
This commit is contained in:
49
core/matcher.py
Normal file
49
core/matcher.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import re
|
||||
from difflib import SequenceMatcher
|
||||
from typing import List, Tuple
|
||||
|
||||
from config import SYNC_CONFIG
|
||||
from .types import SubtitleEntry, WhisperSegment
|
||||
|
||||
|
||||
class TextMatcher:
    """Fuzzy text matching between subtitle entries and Whisper segments.

    All methods are static; the class only serves as a namespace.
    """

    # Compiled once at class creation so the hot matching loop does not go
    # through the ``re`` module's pattern cache on every call.
    _PUNCTUATION_RE = re.compile(r'[^\w\s]')
    _WHITESPACE_RE = re.compile(r'\s+')

    @staticmethod
    def normalize_text(text: str) -> str:
        """Return *text* prepared for comparison.

        Every character that is neither a word character nor whitespace is
        removed, the result is lower-cased, each run of whitespace (newlines
        included) is collapsed to a single space, and leading/trailing
        whitespace is stripped.
        """
        # Punctuation is stripped *before* lower-casing; keep this order so
        # the output stays identical for characters whose lower-case form
        # expands to more than one code point.
        lowered = TextMatcher._PUNCTUATION_RE.sub('', text).lower()
        return TextMatcher._WHITESPACE_RE.sub(' ', lowered).strip()

    @staticmethod
    def text_similarity(text1: str, text2: str) -> float:
        """Return the ``SequenceMatcher`` ratio of the two normalized texts.

        Returns ``0.0`` when either text is empty after normalization,
        otherwise a value in ``[0.0, 1.0]`` where ``1.0`` means the
        normalized texts are identical.
        """
        n1 = TextMatcher.normalize_text(text1)
        n2 = TextMatcher.normalize_text(text2)
        if not n1 or not n2:
            return 0.0
        return SequenceMatcher(None, n1, n2).ratio()

    @staticmethod
    def find_matches(
        subtitles: List["SubtitleEntry"],
        whisper_segments: List["WhisperSegment"],
        chunk_start_ms: int,
    ) -> List[Tuple["SubtitleEntry", int, float]]:
        """Pair each Whisper segment with its most similar nearby subtitle.

        Args:
            subtitles: Candidate entries; ``start_ms`` and ``raw_text`` are
                read from each one.
            whisper_segments: Segments of one chunk; ``start_ms`` (relative
                to the chunk) and ``text`` are read from each one.
            chunk_start_ms: Offset added to every segment's ``start_ms`` to
                obtain its absolute start.

        Returns:
            One ``(subtitle, absolute_segment_start, score)`` tuple per
            segment whose best-scoring subtitle reaches
            ``SYNC_CONFIG['min_match_score']``, in segment order. On equal
            scores the subtitle that appears first in *subtitles* wins.
        """
        # Config values are multiplied by 1000 to compare against *_ms fields.
        window = SYNC_CONFIG['search_window_sec'] * 1000
        scan_dur = SYNC_CONFIG['scan_duration_sec'] * 1000
        min_score = SYNC_CONFIG['min_match_score']

        # Pre-filter once per call: keep only subtitles near this chunk that
        # are long enough to match, and normalize each of them a single time
        # instead of once per (segment, subtitle) pair.
        chunk_lo = chunk_start_ms - window
        chunk_hi = chunk_start_ms + scan_dur + window
        candidates = []
        for sub in subtitles:
            if not (chunk_lo <= sub.start_ms <= chunk_hi):
                continue
            if len(sub.raw_text) < 3:
                continue
            sub_text = TextMatcher.normalize_text(sub.raw_text)
            # An empty normalized text always scores 0.0 and can never win.
            if sub_text:
                candidates.append((sub, sub_text))

        matches = []
        # SequenceMatcher caches its analysis of the *second* sequence, so the
        # segment text is set once per segment and only seq1 varies inside.
        # The (subtitle, segment) argument order matches text_similarity().
        matcher = SequenceMatcher(None)
        for w_seg in whisper_segments:
            seg_text = TextMatcher.normalize_text(w_seg.text)
            if not seg_text:
                continue  # every comparison would score 0.0

            abs_start = w_seg.start_ms + chunk_start_ms
            seg_lo = abs_start - window
            seg_hi = abs_start + window
            matcher.set_seq2(seg_text)

            best_sub = None
            best_score = 0.0
            for sub, sub_text in candidates:
                if not (seg_lo <= sub.start_ms <= seg_hi):
                    continue
                matcher.set_seq1(sub_text)
                score = matcher.ratio()
                if score > best_score:
                    best_sub, best_score = sub, score

            # Explicit None check: do not depend on the entry's truthiness.
            if best_sub is not None and best_score >= min_score:
                matches.append((best_sub, abs_start, best_score))
        return matches
|
||||
Reference in New Issue
Block a user