# subtitles-sync-whisper/core/matcher.py

import re
from difflib import SequenceMatcher
from typing import List, Tuple

from config import SYNC_CONFIG
from .types import SubtitleEntry, WhisperSegment


class TextMatcher:
    """Fuzzy text matching between subtitle entries and Whisper segments.

    All methods are static; the class only acts as a namespace.
    """

    # Compiled once at class creation rather than looked up on every call.
    _PUNCTUATION_RE = re.compile(r'[^\w\s]')
    _WHITESPACE_RE = re.compile(r'\s+')

    @staticmethod
    def normalize_text(text: str) -> str:
        """Return *text* lowercased, without punctuation, with whitespace collapsed.

        Every character that is neither a word character nor whitespace is
        removed, the result is lowercased, each run of whitespace (newlines
        included) becomes a single space, and surrounding whitespace is
        stripped.
        """
        without_punctuation = TextMatcher._PUNCTUATION_RE.sub('', text)
        # ``\s+`` already matches newlines, so no separate newline replacement is needed.
        return TextMatcher._WHITESPACE_RE.sub(' ', without_punctuation.lower()).strip()

    @staticmethod
    def text_similarity(text1: str, text2: str) -> float:
        """Return the similarity of two texts after normalization.

        Returns:
            ``0.0`` when either text is empty once normalized, otherwise the
            ``difflib.SequenceMatcher`` ratio of the two normalized texts.
        """
        n1 = TextMatcher.normalize_text(text1)
        n2 = TextMatcher.normalize_text(text2)
        if not n1 or not n2:
            return 0.0
        return SequenceMatcher(None, n1, n2).ratio()

    @staticmethod
    def find_matches(subtitles: List[SubtitleEntry], whisper_segments: List[WhisperSegment], chunk_start_ms: int) -> \
            List[Tuple[SubtitleEntry, int, float]]:
        """Pair each Whisper segment with its most similar nearby subtitle.

        Args:
            subtitles: Entries to match against; ``start_ms`` and ``raw_text``
                are read from each one.
            whisper_segments: Transcribed segments; ``start_ms`` (relative to
                the chunk) and ``text`` are read from each one.
            chunk_start_ms: Offset added to every segment's ``start_ms`` to
                obtain its absolute start time.

        Returns:
            One ``(subtitle, absolute_segment_start_ms, score)`` tuple per
            segment whose best candidate scores at least
            ``SYNC_CONFIG['min_match_score']``, in segment order. The same
            subtitle may appear for several segments.
        """
        # The config values are multiplied by 1000 to compare against the *_ms fields.
        window = SYNC_CONFIG['search_window_sec'] * 1000
        scan_dur = SYNC_CONFIG['scan_duration_sec'] * 1000
        min_score = SYNC_CONFIG['min_match_score']

        # Pre-filter once: keep subtitles near the scanned chunk whose raw text
        # has at least 3 characters, and normalize each of them a single time
        # instead of once per Whisper segment.
        earliest_sub = chunk_start_ms - window
        latest_sub = chunk_start_ms + scan_dur + window
        candidates = []
        for sub in subtitles:
            if not (earliest_sub <= sub.start_ms <= latest_sub):
                continue
            if len(sub.raw_text) < 3:
                continue
            normalized = TextMatcher.normalize_text(sub.raw_text)
            # An empty normalized text always scores 0.0 and can never win.
            if normalized:
                candidates.append((sub, normalized))

        matches = []
        for w_seg in whisper_segments:
            abs_start = w_seg.start_ms + chunk_start_ms
            segment_text = TextMatcher.normalize_text(w_seg.text)
            if not segment_text:
                # Every comparison would score 0.0, so nothing can match.
                continue

            # SequenceMatcher caches its analysis of the second sequence, so the
            # segment text is set once and only the first sequence changes.
            # The argument order (subtitle, segment) matches text_similarity().
            scorer = SequenceMatcher(None)
            scorer.set_seq2(segment_text)

            window_start = abs_start - window
            window_end = abs_start + window
            best_sub = None
            best_score = 0.0
            for sub, sub_text in candidates:
                if not (window_start <= sub.start_ms <= window_end):
                    continue
                scorer.set_seq1(sub_text)
                score = scorer.ratio()
                # Strict comparison: on a tie the earliest candidate is kept.
                if score > best_score:
                    best_score = score
                    best_sub = sub

            # Identity check so an entry that happens to be falsy is not dropped.
            if best_sub is not None and best_score >= min_score:
                matches.append((best_sub, abs_start, best_score))
        return matches