# yellow-bank-soal/app/services/normalization.py

"""
Dynamic Normalization Service.

Implements dynamic normalization with real-time calculation of rataan and SB
for each tryout. Supports multiple normalization modes:
- Static: Use hardcoded rataan/SB from config
- Dynamic: Calculate rataan/SB from participant NM scores in real-time
- Hybrid: Use static until threshold reached, then switch to dynamic
"""

import logging
import math
from datetime import datetime, timezone
from typing import Literal, Optional, Tuple

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from app.models.tryout import Tryout
from app.models.tryout_stats import TryoutStats

logger = logging.getLogger(__name__)


async def calculate_dynamic_stats(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
) -> Tuple[Optional[float], Optional[float]]:
    """
    Read the stored dynamic mean (rataan) and standard deviation (SB).

    Looks up the single TryoutStats row matching the given
    (website_id, tryout_id) pair and hands back the values stored on it.
    Nothing is recomputed here; the row is only read.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier

    Returns:
        Tuple of (rataan, sb) as stored on the row, or (None, None) when
        no TryoutStats row exists for the pair
    """
    stats_query = select(TryoutStats).where(
        TryoutStats.website_id == website_id,
        TryoutStats.tryout_id == tryout_id,
    )
    record = (await db.execute(stats_query)).scalar_one_or_none()

    # A missing row means nothing has been recorded for this tryout yet.
    return (None, None) if record is None else (record.rataan, record.sb)


async def update_dynamic_normalization(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
    nm: int,
) -> Tuple[float, float]:
    """
    Fold one new NM score into the running statistics of a tryout.

    The TryoutStats row for (website_id, tryout_id) is loaded and updated
    in place; when no row exists yet, one is created from this single score.
    For an existing row the function:
    - increments participant_count by 1
    - adds NM to total_nm_sum and NM² to total_nm_sq_sum
    - widens min_nm / max_nm when needed
    - recomputes rataan (mean) and sb (population standard deviation)

    The session is flushed but not committed.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier
        nm: Nilai Mentah (raw score) to add

    Returns:
        Tuple of updated (rataan, sb)

    Raises:
        ValueError: If nm is out of valid range [0, 1000]
    """
    nm_in_range = 0 <= nm <= 1000
    if not nm_in_range:
        raise ValueError(f"nm must be in range [0, 1000], got {nm}")

    # NOTE(review): this is a read-modify-write with no row lock visible
    # here - confirm callers serialize updates for the same tryout.
    stats_query = select(TryoutStats).where(
        TryoutStats.website_id == website_id,
        TryoutStats.tryout_id == tryout_id,
    )
    stats = (await db.execute(stats_query)).scalar_one_or_none()

    if stats is not None:
        # Running totals first; mean and SD are derived from them below.
        stats.participant_count += 1
        stats.total_nm_sum += nm
        stats.total_nm_sq_sum += nm * nm

        # Widen the observed range when the new score falls outside it.
        if stats.min_nm is None or nm < stats.min_nm:
            stats.min_nm = nm
        if stats.max_nm is None or nm > stats.max_nm:
            stats.max_nm = nm

        count = stats.participant_count
        average = stats.total_nm_sum / count
        stats.rataan = average

        if count > 1:
            # Population variance: E[NM²] - (E[NM])². Rounding error can
            # push it slightly below zero, so it is floored at 0.
            spread = (stats.total_nm_sq_sum / count) - (average ** 2)
            stats.sb = math.sqrt(max(0.0, spread))
        else:
            stats.sb = 0.0

        stats.last_calculated = datetime.now(timezone.utc)
    else:
        # First score for this tryout: the mean is the score itself and
        # the standard deviation of a single observation is 0.
        stats = TryoutStats(
            website_id=website_id,
            tryout_id=tryout_id,
            participant_count=1,
            total_nm_sum=float(nm),
            total_nm_sq_sum=float(nm * nm),
            rataan=float(nm),
            sb=0.0,
            min_nm=nm,
            max_nm=nm,
            last_calculated=datetime.now(timezone.utc),
        )
        db.add(stats)

    await db.flush()

    logger.info(
        f"Updated dynamic normalization for tryout {tryout_id}, "
        f"website {website_id}: participant_count={stats.participant_count}, "
        f"rataan={stats.rataan:.2f}, sb={stats.sb:.2f}"
    )

    # Both branches above assign rataan and sb; the asserts narrow the
    # Optional attribute types for the return annotation.
    assert stats.rataan is not None
    assert stats.sb is not None
    return stats.rataan, stats.sb


def apply_normalization(
    nm: int,
    rataan: float,
    sb: float,
) -> int:
    """
    Apply normalization to NM to get NN (Nilai Nasional).

    Formula: NN = 500 + 100 × ((NM - Rataan) / SB)

    Normalizes scores to mean=500, SD=100 distribution.

    Args:
        nm: Nilai Mentah (raw score) in range [0, 1000]
        rataan: Mean of NM scores
        sb: Standard deviation of NM scores

    Returns:
        NN rounded to an integer and clamped to [0, 1000]. When sb <= 0
        (e.g. all scores identical) the neutral score 500 is returned.

    Raises:
        ValueError: If nm is out of range [0, 1000], or if the normalized
            value is not a number (NaN rataan or sb)
    """
    if not 0 <= nm <= 1000:
        raise ValueError(f"nm must be in range [0, 1000], got {nm}")
    if sb <= 0:
        # No usable spread: every participant maps to the centre score.
        return 500

    z_score = (nm - rataan) / sb
    nn = 500 + 100 * z_score

    if math.isnan(nn):
        # NaN cannot be rounded or clamped meaningfully; fail with a clear
        # message instead of the opaque error round() would raise.
        raise ValueError(
            f"cannot normalize nm={nm} with rataan={rataan}, sb={sb}"
        )

    # Clamp BEFORE rounding: a very small sb can push nn to +/-infinity,
    # and round() raises OverflowError on infinite floats.
    clamped = max(0.0, min(1000.0, nn))

    # round() on a float returns an int; exact halves go to the even integer.
    return round(clamped)


async def get_normalization_mode(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
) -> Literal["static", "dynamic", "hybrid"]:
    """
    Look up which normalization mode a tryout is configured with.

    Loads the Tryout row for (website_id, tryout_id) and returns its
    normalization_mode attribute unchanged.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier

    Returns:
        Normalization mode: "static", "dynamic", or "hybrid"

    Raises:
        ValueError: If tryout not found
    """
    tryout_query = select(Tryout).where(
        Tryout.website_id == website_id,
        Tryout.tryout_id == tryout_id,
    )
    tryout = (await db.execute(tryout_query)).scalar_one_or_none()

    if tryout is not None:
        return tryout.normalization_mode

    raise ValueError(
        f"Tryout {tryout_id} not found for website {website_id}"
    )


async def check_threshold_for_dynamic(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
) -> bool:
    """
    Tell whether enough participants exist to use dynamic normalization.

    The participant_count stored in TryoutStats is compared against the
    tryout's min_sample_for_dynamic setting. A missing stats row counts as
    zero participants; a missing threshold value (including the case where
    no Tryout row matches) falls back to 100.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier

    Returns:
        True if participant_count >= min_sample_for_dynamic, else False
    """
    stats_query = select(TryoutStats).where(
        TryoutStats.website_id == website_id,
        TryoutStats.tryout_id == tryout_id,
    )
    stats_row = (await db.execute(stats_query)).scalar_one_or_none()
    if stats_row:
        participants_so_far = stats_row.participant_count
    else:
        participants_so_far = 0

    threshold_query = select(Tryout.min_sample_for_dynamic).where(
        Tryout.website_id == website_id,
        Tryout.tryout_id == tryout_id,
    )
    required = (await db.execute(threshold_query)).scalar_one_or_none()
    if required is None:
        # Nothing configured: use the default minimum sample size.
        required = 100

    return participants_so_far >= required


async def _fetch_static_params(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
) -> Tuple[float, float]:
    """
    Load the configured static (rataan, sb) pair for a tryout.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier

    Returns:
        Tuple of (static_rataan, static_sb) exactly as stored

    Raises:
        ValueError: If no Tryout row matches the pair
    """
    result = await db.execute(
        select(Tryout.static_rataan, Tryout.static_sb).where(
            Tryout.website_id == website_id,
            Tryout.tryout_id == tryout_id,
        )
    )
    # Two columns are selected, so the full row is required.
    # scalar_one_or_none() would yield only the first column
    # (static_rataan), which cannot be unpacked into two values.
    row = result.one_or_none()

    if row is None:
        raise ValueError(
            f"Tryout {tryout_id} not found for website {website_id}"
        )

    # NOTE(review): values are returned as stored - confirm the columns
    # cannot be NULL for tryouts that use static normalization.
    rataan, sb = row
    return rataan, sb


async def get_normalization_params(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
) -> Tuple[float, float, Literal["static", "dynamic"]]:
    """
    Get normalization parameters (rataan, sb) based on current mode.

    Determines which normalization parameters to use:
    - Static mode: Use the tryout's static_rataan and static_sb
    - Dynamic mode: Use rataan and sb stored in TryoutStats
    - Hybrid mode: Use static until the participant threshold is reached,
      then dynamic; falls back to static if dynamic stats are missing

    Any mode value other than "static" or "dynamic" is handled as hybrid.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier

    Returns:
        Tuple of (rataan, sb, mode_used), where mode_used reports which
        source the values actually came from

    Raises:
        ValueError: If tryout not found, or if mode is "dynamic" and no
            dynamic stats are available
    """
    mode = await get_normalization_mode(db, website_id, tryout_id)

    if mode == "static":
        rataan, sb = await _fetch_static_params(db, website_id, tryout_id)
        return rataan, sb, "static"

    if mode == "dynamic":
        rataan, sb = await calculate_dynamic_stats(db, website_id, tryout_id)

        if rataan is None or sb is None:
            raise ValueError(
                f"Dynamic normalization not available for tryout {tryout_id}. "
                "No stats have been calculated yet."
            )

        if sb == 0:
            logger.warning(
                "Standard deviation is 0 for tryout %s. "
                "All NM scores are identical.",
                tryout_id,
            )

        return rataan, sb, "dynamic"

    # Hybrid: prefer dynamic values once the sample threshold is met.
    if await check_threshold_for_dynamic(db, website_id, tryout_id):
        rataan, sb = await calculate_dynamic_stats(db, website_id, tryout_id)
        if rataan is not None and sb is not None:
            return rataan, sb, "dynamic"

    # Threshold not met, or dynamic stats unavailable: use static values.
    rataan, sb = await _fetch_static_params(db, website_id, tryout_id)
    return rataan, sb, "static"


async def calculate_skewness(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
) -> Optional[float]:
    """
    Report skewness of the NM distribution where it can be determined.

    Skewness measures the asymmetry of the probability distribution.
    Values:
    - Skewness ≈ 0: Symmetric distribution
    - Skewness > 0: Right-skewed (tail to the right)
    - Skewness < 0: Left-skewed (tail to the left)

    Reference formula (NOT evaluated here):
        Skewness = (n / ((n-1)(n-2))) * Σ((x - mean) / SD)³

    The formula needs the individual NM values, and this function only
    reads the aggregated TryoutStats row. The only case it can answer is
    the degenerate one where the stored SD is exactly 0 (skewness 0.0).
    Everything else yields None until individual scores are available.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier

    Returns:
        0.0 when at least 3 participants exist and the stored SD is 0;
        None otherwise (no stats row, fewer than 3 participants, or
        skewness not computable from the aggregates)
    """
    result = await db.execute(
        select(TryoutStats).where(
            TryoutStats.website_id == website_id,
            TryoutStats.tryout_id == tryout_id,
        )
    )
    stats = result.scalar_one_or_none()

    if stats is None or stats.participant_count < 3:
        # Need at least 3 samples for skewness to be defined.
        return None

    if stats.sb == 0:
        # All values are identical, so the distribution is symmetric.
        return 0.0

    # A real computation needs per-participant NM values, which the
    # aggregate row read above does not provide.
    return None


async def validate_dynamic_normalization(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
    target_mean: float = 500.0,
    target_sd: float = 100.0,
    mean_tolerance: float = 5.0,
    sd_tolerance: float = 5.0,
) -> Tuple[bool, dict]:
    """
    Validate that dynamic normalization produces expected distribution.

    Checks if the stored rataan and sb are close to the target values and
    collects human-readable warnings and suggestions.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier
        target_mean: Target mean (default: 500)
        target_sd: Target standard deviation (default: 100)
        mean_tolerance: Allowed deviation from target mean (default: 5)
        sd_tolerance: Allowed deviation from target SD (default: 5)

    Returns:
        Tuple of (is_valid, validation_details). is_valid is True only when
        both the mean and the SD are within tolerance.

        validation_details contains:
        - participant_count: Number of participants
        - current_rataan: Current mean
        - current_sb: Current standard deviation
        - mean_deviation: Absolute deviation from target mean
        - sd_deviation: Absolute deviation from target SD
        - mean_within_tolerance: True if mean deviation <= mean_tolerance
        - sd_within_tolerance: True if SD deviation <= sd_tolerance
        - warnings: List of warning messages
        - suggestions: List of suggestions
    """
    result = await db.execute(
        select(TryoutStats).where(
            TryoutStats.website_id == website_id,
            TryoutStats.tryout_id == tryout_id,
        )
    )
    stats = result.scalar_one_or_none()

    if stats is None or stats.rataan is None or stats.sb is None:
        # Report the real participant count when a row exists but its
        # mean/SD have not been filled in; 0 only when nothing is stored.
        known_count = (stats.participant_count or 0) if stats is not None else 0
        return False, {
            "participant_count": known_count,
            "current_rataan": None,
            "current_sb": None,
            "mean_deviation": None,
            "sd_deviation": None,
            "mean_within_tolerance": False,
            "sd_within_tolerance": False,
            "warnings": ["No statistics available for validation"],
            "suggestions": ["Wait for more participants to complete sessions"],
        }

    mean_deviation = abs(stats.rataan - target_mean)
    sd_deviation = abs(stats.sb - target_sd)

    # Tolerances are inclusive: a deviation equal to the tolerance passes.
    mean_within_tolerance = mean_deviation <= mean_tolerance
    sd_within_tolerance = sd_deviation <= sd_tolerance

    is_valid = mean_within_tolerance and sd_within_tolerance

    warnings = []
    suggestions = []

    if not mean_within_tolerance:
        warnings.append(f"Mean deviation ({mean_deviation:.2f}) exceeds tolerance ({mean_tolerance})")
        if stats.rataan > target_mean:
            suggestions.append("Distribution may be right-skewed - consider checking question difficulty")
        else:
            suggestions.append("Distribution may be left-skewed - consider checking question difficulty")

    if not sd_within_tolerance:
        warnings.append(f"SD deviation ({sd_deviation:.2f}) exceeds tolerance ({sd_tolerance})")
        if stats.sb < target_sd:
            suggestions.append("SD too low - scores may be too tightly clustered")
        else:
            suggestions.append("SD too high - scores may have too much variance")

    # Skewness is advisory only; it never changes is_valid.
    skewness = await calculate_skewness(db, website_id, tryout_id)
    if skewness is not None and abs(skewness) > 0.5:
        warnings.append(f"Distribution skewness ({skewness:.2f}) > 0.5 - distribution may be asymmetric")
        suggestions.append("Consider using static normalization if dynamic normalization is unstable")

    # A small sample is flagged as a suggestion, not as a failure.
    if stats.participant_count < 100:
        suggestions.append(f"Participant count ({stats.participant_count}) below recommended minimum (100)")

    return is_valid, {
        "participant_count": stats.participant_count,
        "current_rataan": stats.rataan,
        "current_sb": stats.sb,
        "mean_deviation": mean_deviation,
        "sd_deviation": sd_deviation,
        "mean_within_tolerance": mean_within_tolerance,
        "sd_within_tolerance": sd_within_tolerance,
        "warnings": warnings,
        "suggestions": suggestions,
    }