"""
Content analysis helpers for word-category detection and result assembly.
"""

import re
from functools import lru_cache
from typing import Any

from app.core.logging import get_logger
from app.services.analysis_utils import AnalysisUtils

# Module-level logger obtained from the project's logging helper under the
# name "content_analysis".
logger = get_logger("content_analysis")


# Matches any single character that is NOT an ASCII letter (a-z, A-Z) or an
# apostrophe. Substituting every match with "" leaves only letters and
# apostrophes in a token; digits, punctuation, whitespace and non-ASCII
# letters are all removed.
_NON_ALPHA_RE = re.compile(r"[^a-zA-Z']")


@lru_cache(maxsize=32)
def _load_targets(list_file: str) -> set[str]:
    """Return the entries of ``list_file`` as a set, memoised per file name.

    The list is read through ``AnalysisUtils.load_word_list`` and converted
    to a set for fast membership tests. Up to 32 distinct ``list_file``
    values are kept in the LRU cache.

    Any exception raised while loading or converting is logged and replaced
    by an empty set. Because that fallback is an ordinary return value, it is
    cached as well: later calls for the same file keep returning the empty
    set until the cache is cleared.

    The returned set is the cached object itself, so callers should treat it
    as read-only.
    """
    try:
        return set(AnalysisUtils.load_word_list(list_file))
    except Exception as exc:
        logger.error(f"_load_targets - failed to load {list_file}: {exc}")
        return set()


def analyze_word_category(
    transcription: dict[str, Any], list_file: str
) -> dict[str, Any]:
    """Count transcription words that belong to the category in ``list_file``.

    Every entry of ``transcription["words"]`` is normalised (lower-cased, then
    stripped of everything except ASCII letters and apostrophes) and looked up
    in the target set loaded from ``list_file``.

    Args:
        transcription: Mapping with an optional ``"words"`` sequence. Each item
            is read with ``.get("word")`` and ``.get("start")``.
        list_file: Word list file name passed to ``_load_targets``.

    Returns:
        A dict with three keys:

        * ``"count"``: total number of matching tokens.
        * ``"timestamps"``: legacy format, one single-key dict per distinct
          word mapping it to the list of its formatted start times, in order
          of first appearance.
        * ``"word_counts"``: number of occurrences per distinct word.

        When there are no words, no targets, or any error occurs, the result
        is ``{"count": 0, "timestamps": [], "word_counts": {}}``. Errors are
        logged as warnings and never propagated.
    """
    try:
        # ``or []`` also covers a "words" key that is present but None.
        words = transcription.get("words") or []
        if not words:
            # Nothing to scan: skip loading the word list, mirroring the
            # early return analyze_phrases performs on empty text.
            return {"count": 0, "timestamps": [], "word_counts": {}}

        targets = _load_targets(list_file)
        if not targets:
            # With no targets nothing can match.
            return {"count": 0, "timestamps": [], "word_counts": {}}

        # The formatter is loop-invariant, so build it once rather than once
        # per matching token.
        to_mmss = AnalysisUtils().seconds_to_mmss

        match_count = 0
        word_counts: dict[str, int] = {}
        word_timestamps: dict[str, list[str]] = {}

        for word_info in words:
            # Normalize token similar to legacy logic
            word = _NON_ALPHA_RE.sub("", (word_info.get("word") or "").lower())
            if not word or word not in targets:
                continue

            match_count += 1
            word_counts[word] = word_counts.get(word, 0) + 1
            # Build word-to-timestamp mapping for legacy format
            word_timestamps.setdefault(word, []).append(
                to_mmss(word_info.get("start"))
            )

        # Legacy format: a list of single-entry dicts instead of one mapping.
        timestamps_legacy = [
            {word: stamps} for word, stamps in word_timestamps.items()
        ]

        return {
            "count": match_count,
            "timestamps": timestamps_legacy,
            "word_counts": word_counts,
        }

    except Exception as e:
        logger.warning(f"analyze_word_category failed for {list_file}: {e}")
        return {"count": 0, "timestamps": [], "word_counts": {}}


def analyze_phrases(transcription: dict[str, Any], phrase_file: str) -> dict[str, Any]:
    """Count occurrences of listed phrases in the transcription text.

    The text in ``transcription["text"]`` is lower-cased and every phrase
    loaded from ``phrase_file`` is lower-cased the same way, so matching is
    case-insensitive. Matching is a plain, non-overlapping substring search:
    a phrase is also counted when it occurs inside a longer word.

    Args:
        transcription: Mapping with an optional ``"text"`` string.
        phrase_file: Phrase list file name passed to ``_load_targets``.

    Returns:
        A dict with three keys:

        * ``"phrases"``: list of single-key dicts ``{phrase: count}`` for
          every phrase found at least once, sorted by phrase.
        * ``"phrase_counter"``: the total number of matches, or the string
          ``"No Phrases"`` when nothing matched.
        * ``"total_count"``: the total number of matches as an int.

        When the text is missing or empty, no phrases are available, or any
        error occurs, the result is the "nothing found" form of that dict.
        Errors are logged as warnings and never propagated.
    """
    try:
        # ``or ""`` also covers a "text" key that is present but None.
        full_text = (transcription.get("text") or "").lower()
        if not full_text:
            return {"phrases": [], "phrase_counter": "No Phrases", "total_count": 0}

        # Load phrase list
        phrase_list = _load_targets(phrase_file)
        if not phrase_list:
            logger.warning(f"No phrases loaded from {phrase_file}")
            return {"phrases": [], "phrase_counter": "No Phrases", "total_count": 0}

        # Prepare the search terms:
        # - skip empty / whitespace-only entries (an empty needle "matches"
        #   len(text) + 1 times and would swamp the totals);
        # - lower-case them so they can match the lower-cased text;
        # - de-duplicate after lower-casing so no phrase is counted twice;
        # - sort so the output order does not depend on set iteration order.
        needles = sorted(
            {phrase.lower() for phrase in phrase_list if phrase and phrase.strip()}
        )

        phrase_results: list[dict[str, int]] = []
        total_count = 0

        for phrase in needles:
            # str.count performs the same non-overlapping literal search as
            # re.findall on an escaped pattern, without building a match list.
            count = full_text.count(phrase)
            if count:
                phrase_results.append({phrase: count})
                total_count += count

        return {
            "phrases": phrase_results,
            # Legacy field: an int when there are matches, a message otherwise.
            "phrase_counter": total_count if total_count > 0 else "No Phrases",
            "total_count": total_count,
        }

    except Exception as e:
        logger.warning(f"analyze_phrases failed for {phrase_file}: {e}")
        return {"phrases": [], "phrase_counter": "No Phrases", "total_count": 0}