first commit

2026-03-21 23:32:59 +07:00
commit cf193d7ea0
57 changed files with 17871 additions and 0 deletions
--- a/app/services/normalization.py
+++ b/app/services/normalization.py
@@ -0,0 +1,538 @@
+"""
+Dynamic Normalization Service.
+
+Implements dynamic normalization with real-time calculation of rataan and SB
+for each tryout. Supports multiple normalization modes:
+- Static: Use hardcoded rataan/SB from config
+- Dynamic: Calculate rataan/SB from participant NM scores in real-time
+- Hybrid: Use static until threshold reached, then switch to dynamic
+"""
+
+import logging
+import math
+from datetime import datetime, timezone
+from typing import Literal, Optional, Tuple
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.models.tryout import Tryout
+from app.models.tryout_stats import TryoutStats
+
+logger = logging.getLogger(__name__)
+
+
+async def calculate_dynamic_stats(
+    db: AsyncSession,
+    website_id: int,
+    tryout_id: str,
+) -> Tuple[Optional[float], Optional[float]]:
+    """
+    Calculate current dynamic stats (rataan and SB) from TryoutStats.
+
+    Fetches current TryoutStats for this (tryout_id, website_id) pair
+    and returns the calculated rataan and SB values.
+
+    Args:
+        db: Async database session
+        website_id: Website identifier
+        tryout_id: Tryout identifier
+
+    Returns:
+        Tuple of (rataan, sb), both None if no stats exist
+    """
+    result = await db.execute(
+        select(TryoutStats).where(
+            TryoutStats.website_id == website_id,
+            TryoutStats.tryout_id == tryout_id,
+        )
+    )
+    stats = result.scalar_one_or_none()
+
+    if stats is None:
+        return None, None
+
+    return stats.rataan, stats.sb
+
+
+async def update_dynamic_normalization(
+    db: AsyncSession,
+    website_id: int,
+    tryout_id: str,
+    nm: int,
+) -> Tuple[float, float]:
+    """
+    Update dynamic normalization with new NM score.
+
+    Fetches current TryoutStats and incrementally updates it with the new NM:
+    - Increments participant_count by 1
+    - Adds NM to total_nm_sum
+    - Adds NM² to total_nm_sq_sum
+    - Recalculates rataan and sb
+
+    Args:
+        db: Async database session
+        website_id: Website identifier
+        tryout_id: Tryout identifier
+        nm: Nilai Mentah (raw score) to add
+
+    Returns:
+        Tuple of updated (rataan, sb)
+
+    Raises:
+        ValueError: If nm is out of valid range [0, 1000]
+    """
+    if not 0 <= nm <= 1000:
+        raise ValueError(f"nm must be in range [0, 1000], got {nm}")
+
+    result = await db.execute(
+        select(TryoutStats).where(
+            TryoutStats.website_id == website_id,
+            TryoutStats.tryout_id == tryout_id,
+        )
+    )
+    stats = result.scalar_one_or_none()
+
+    if stats is None:
+        # Initialize new stats record
+        stats = TryoutStats(
+            website_id=website_id,
+            tryout_id=tryout_id,
+            participant_count=1,
+            total_nm_sum=float(nm),
+            total_nm_sq_sum=float(nm * nm),
+            rataan=float(nm),
+            sb=0.0,  # SD is 0 for single data point
+            min_nm=nm,
+            max_nm=nm,
+            last_calculated=datetime.now(timezone.utc),
+        )
+        db.add(stats)
+    else:
+        # Incrementally update existing stats
+        stats.participant_count += 1
+        stats.total_nm_sum += nm
+        stats.total_nm_sq_sum += nm * nm
+
+        # Update min/max
+        if stats.min_nm is None or nm < stats.min_nm:
+            stats.min_nm = nm
+        if stats.max_nm is None or nm > stats.max_nm:
+            stats.max_nm = nm
+
+        # Recalculate mean and SD
+        n = stats.participant_count
+        sum_nm = stats.total_nm_sum
+        sum_nm_sq = stats.total_nm_sq_sum
+
+        # Mean = Σ NM / n
+        mean = sum_nm / n
+        stats.rataan = mean
+
+        # Variance = (Σ NM² / n) - (mean)²
+        # Using population standard deviation
+        if n > 1:
+            variance = (sum_nm_sq / n) - (mean ** 2)
+            # Clamp variance to non-negative (handles floating point errors)
+            variance = max(0.0, variance)
+            stats.sb = math.sqrt(variance)
+        else:
+            stats.sb = 0.0
+
+        stats.last_calculated = datetime.now(timezone.utc)
+
+    await db.flush()
+
+    logger.info(
+        f"Updated dynamic normalization for tryout {tryout_id}, "
+        f"website {website_id}: participant_count={stats.participant_count}, "
+        f"rataan={stats.rataan:.2f}, sb={stats.sb:.2f}"
+    )
+
+    # rataan and sb are always set by this function
+    assert stats.rataan is not None
+    assert stats.sb is not None
+    return stats.rataan, stats.sb
+
+
+def apply_normalization(
+    nm: int,
+    rataan: float,
+    sb: float,
+) -> int:
+    """
+    Apply normalization to NM to get NN (Nilai Nasional).
+
+    Formula: NN = 500 + 100 × ((NM - Rataan) / SB)
+
+    Normalizes scores to mean=500, SD=100 distribution.
+
+    Args:
+        nm: Nilai Mentah (raw score) in range [0, 1000]
+        rataan: Mean of NM scores
+        sb: Standard deviation of NM scores
+
+    Returns:
+        NN (normalized score) in range [0, 1000]
+
+    Raises:
+        ValueError: If nm is out of range or sb is invalid
+    """
+    if not 0 <= nm <= 1000:
+        raise ValueError(f"nm must be in range [0, 1000], got {nm}")
+    if sb <= 0:
+        # If SD is 0 or negative, return default normalized score
+        # This handles edge case where all scores are identical
+        return 500
+
+    # Calculate normalized score
+    z_score = (nm - rataan) / sb
+    nn = 500 + 100 * z_score
+
+    # Round to integer and clamp to valid range [0, 1000]
+    nn_int = round(nn)
+    return max(0, min(1000, nn_int))
+
+
+async def get_normalization_mode(
+    db: AsyncSession,
+    website_id: int,
+    tryout_id: str,
+) -> Literal["static", "dynamic", "hybrid"]:
+    """
+    Get the current normalization mode for a tryout.
+
+    Args:
+        db: Async database session
+        website_id: Website identifier
+        tryout_id: Tryout identifier
+
+    Returns:
+        Normalization mode: "static", "dynamic", or "hybrid"
+
+    Raises:
+        ValueError: If tryout not found
+    """
+    result = await db.execute(
+        select(Tryout).where(
+            Tryout.website_id == website_id,
+            Tryout.tryout_id == tryout_id,
+        )
+    )
+    tryout = result.scalar_one_or_none()
+
+    if tryout is None:
+        raise ValueError(
+            f"Tryout {tryout_id} not found for website {website_id}"
+        )
+
+    return tryout.normalization_mode
+
+
+async def check_threshold_for_dynamic(
+    db: AsyncSession,
+    website_id: int,
+    tryout_id: str,
+) -> bool:
+    """
+    Check if participant count meets threshold for dynamic normalization.
+
+    Compares current participant_count with min_sample_for_dynamic from config.
+
+    Args:
+        db: Async database session
+        website_id: Website identifier
+        tryout_id: Tryout identifier
+
+    Returns:
+        True if participant_count >= min_sample_for_dynamic, else False
+    """
+    # Fetch current TryoutStats
+    stats_result = await db.execute(
+        select(TryoutStats).where(
+            TryoutStats.website_id == website_id,
+            TryoutStats.tryout_id == tryout_id,
+        )
+    )
+    stats = stats_result.scalar_one_or_none()
+    current_participant_count = stats.participant_count if stats else 0
+
+    # Fetch min_sample_for_dynamic from config
+    tryout_result = await db.execute(
+        select(Tryout.min_sample_for_dynamic).where(
+            Tryout.website_id == website_id,
+            Tryout.tryout_id == tryout_id,
+        )
+    )
+    min_sample = tryout_result.scalar_one_or_none()
+
+    if min_sample is None:
+        # Default to 100 if not configured
+        min_sample = 100
+
+    return current_participant_count >= min_sample
+
+
+async def get_normalization_params(
+    db: AsyncSession,
+    website_id: int,
+    tryout_id: str,
+) -> Tuple[float, float, Literal["static", "dynamic"]]:
+    """
+    Get normalization parameters (rataan, sb) based on current mode.
+
+    Determines which normalization parameters to use:
+    - Static mode: Use config.static_rataan and config.static_sb
+    - Dynamic mode: Use calculated rataan and sb from TryoutStats
+    - Hybrid mode: Use static until threshold reached, then dynamic
+
+    Args:
+        db: Async database session
+        website_id: Website identifier
+        tryout_id: Tryout identifier
+
+    Returns:
+        Tuple of (rataan, sb, mode_used)
+
+    Raises:
+        ValueError: If tryout not found or dynamic stats unavailable
+    """
+    # Get normalization mode
+    mode = await get_normalization_mode(db, website_id, tryout_id)
+
+    if mode == "static":
+        # Use static values from config
+        result = await db.execute(
+            select(Tryout.static_rataan, Tryout.static_sb).where(
+                Tryout.website_id == website_id,
+                Tryout.tryout_id == tryout_id,
+            )
+        )
+        row = result.scalar_one_or_none()
+
+        if row is None:
+            raise ValueError(
+                f"Tryout {tryout_id} not found for website {website_id}"
+            )
+
+        rataan, sb = row
+        return rataan, sb, "static"
+
+    elif mode == "dynamic":
+        # Use dynamic values from stats
+        rataan, sb = await calculate_dynamic_stats(db, website_id, tryout_id)
+
+        if rataan is None or sb is None:
+            raise ValueError(
+                f"Dynamic normalization not available for tryout {tryout_id}. "
+                "No stats have been calculated yet."
+            )
+
+        if sb == 0:
+            logger.warning(
+                f"Standard deviation is 0 for tryout {tryout_id}. "
+                "All NM scores are identical."
+            )
+
+        return rataan, sb, "dynamic"
+
+    else:  # hybrid
+        # Check threshold
+        threshold_met = await check_threshold_for_dynamic(db, website_id, tryout_id)
+
+        if threshold_met:
+            # Use dynamic values
+            rataan, sb = await calculate_dynamic_stats(db, website_id, tryout_id)
+
+            if rataan is None or sb is None:
+                # Fallback to static if dynamic not available
+                result = await db.execute(
+                    select(Tryout.static_rataan, Tryout.static_sb).where(
+                        Tryout.website_id == website_id,
+                        Tryout.tryout_id == tryout_id,
+                    )
+                )
+                row = result.scalar_one_or_none()
+                if row is None:
+                    raise ValueError(
+                        f"Tryout {tryout_id} not found for website {website_id}"
+                    )
+                rataan, sb = row
+                return rataan, sb, "static"
+
+            return rataan, sb, "dynamic"
+        else:
+            # Use static values
+            result = await db.execute(
+                select(Tryout.static_rataan, Tryout.static_sb).where(
+                    Tryout.website_id == website_id,
+                    Tryout.tryout_id == tryout_id,
+                )
+            )
+            row = result.scalar_one_or_none()
+            if row is None:
+                raise ValueError(
+                    f"Tryout {tryout_id} not found for website {website_id}"
+                )
+            rataan, sb = row
+            return rataan, sb, "static"
+
+
+async def calculate_skewness(
+    db: AsyncSession,
+    website_id: int,
+    tryout_id: str,
+) -> Optional[float]:
+    """
+    Calculate skewness of NM distribution for validation.
+
+    Skewness measures the asymmetry of the probability distribution.
+    Values:
+    - Skewness ≈ 0: Symmetric distribution
+    - Skewness > 0: Right-skewed (tail to the right)
+    - Skewness < 0: Left-skewed (tail to the left)
+
+    Formula: Skewness = (n / ((n-1)(n-2))) * Σ((x - mean) / SD)³
+
+    Args:
+        db: Async database session
+        website_id: Website identifier
+        tryout_id: Tryout identifier
+
+    Returns:
+        Skewness value, or None if insufficient data
+    """
+    result = await db.execute(
+        select(TryoutStats).where(
+            TryoutStats.website_id == website_id,
+            TryoutStats.tryout_id == tryout_id,
+        )
+    )
+    stats = result.scalar_one_or_none()
+
+    if stats is None or stats.participant_count < 3:
+        # Need at least 3 samples for skewness calculation
+        return None
+
+    n = stats.participant_count
+    mean = stats.rataan
+    sd = stats.sb
+
+    if sd == 0:
+        return 0.0  # All values are identical
+
+    # Calculate skewness
+    # We need individual NM values, which we don't have in TryoutStats
+    # For now, return None as we need a different approach
+    # This would require storing all NM values or calculating on-the-fly
+    return None
+
+
+async def validate_dynamic_normalization(
+    db: AsyncSession,
+    website_id: int,
+    tryout_id: str,
+    target_mean: float = 500.0,
+    target_sd: float = 100.0,
+    mean_tolerance: float = 5.0,
+    sd_tolerance: float = 5.0,
+) -> Tuple[bool, dict]:
+    """
+    Validate that dynamic normalization produces expected distribution.
+
+    Checks if calculated rataan and sb are close to target values.
+
+    Args:
+        db: Async database session
+        website_id: Website identifier
+        tryout_id: Tryout identifier
+        target_mean: Target mean (default: 500)
+        target_sd: Target standard deviation (default: 100)
+        mean_tolerance: Allowed deviation from target mean (default: 5)
+        sd_tolerance: Allowed deviation from target SD (default: 5)
+
+    Returns:
+        Tuple of (is_valid, validation_details)
+
+        validation_details contains:
+        - participant_count: Number of participants
+        - current_rataan: Current mean
+        - current_sb: Current standard deviation
+        - mean_deviation: Absolute deviation from target mean
+        - sd_deviation: Absolute deviation from target SD
+        - mean_within_tolerance: True if mean deviation < mean_tolerance
+        - sd_within_tolerance: True if SD deviation < sd_tolerance
+        - warnings: List of warning messages
+        - suggestions: List of suggestions
+    """
+    # Get current stats
+    result = await db.execute(
+        select(TryoutStats).where(
+            TryoutStats.website_id == website_id,
+            TryoutStats.tryout_id == tryout_id,
+        )
+    )
+    stats = result.scalar_one_or_none()
+
+    if stats is None or stats.rataan is None or stats.sb is None:
+        return False, {
+            "participant_count": 0,
+            "current_rataan": None,
+            "current_sb": None,
+            "mean_deviation": None,
+            "sd_deviation": None,
+            "mean_within_tolerance": False,
+            "sd_within_tolerance": False,
+            "warnings": ["No statistics available for validation"],
+            "suggestions": ["Wait for more participants to complete sessions"],
+        }
+
+    # Calculate deviations
+    mean_deviation = abs(stats.rataan - target_mean)
+    sd_deviation = abs(stats.sb - target_sd)
+
+    # Check tolerance
+    mean_within_tolerance = mean_deviation <= mean_tolerance
+    sd_within_tolerance = sd_deviation <= sd_tolerance
+
+    is_valid = mean_within_tolerance and sd_within_tolerance
+
+    # Generate warnings
+    warnings = []
+    suggestions = []
+
+    if not mean_within_tolerance:
+        warnings.append(f"Mean deviation ({mean_deviation:.2f}) exceeds tolerance ({mean_tolerance})")
+        if stats.rataan > target_mean:
+            suggestions.append("Distribution may be right-skewed - consider checking question difficulty")
+        else:
+            suggestions.append("Distribution may be left-skewed - consider checking question difficulty")
+
+    if not sd_within_tolerance:
+        warnings.append(f"SD deviation ({sd_deviation:.2f}) exceeds tolerance ({sd_tolerance})")
+        if stats.sb < target_sd:
+            suggestions.append("SD too low - scores may be too tightly clustered")
+        else:
+            suggestions.append("SD too high - scores may have too much variance")
+
+    # Check for skewness
+    skewness = await calculate_skewness(db, website_id, tryout_id)
+    if skewness is not None and abs(skewness) > 0.5:
+        warnings.append(f"Distribution skewness ({skewness:.2f}) > 0.5 - distribution may be asymmetric")
+        suggestions.append("Consider using static normalization if dynamic normalization is unstable")
+
+    # Check participant count
+    if stats.participant_count < 100:
+        suggestions.append(f"Participant count ({stats.participant_count}) below recommended minimum (100)")
+
+    return is_valid, {
+        "participant_count": stats.participant_count,
+        "current_rataan": stats.rataan,
+        "current_sb": stats.sb,
+        "mean_deviation": mean_deviation,
+        "sd_deviation": sd_deviation,
+        "mean_within_tolerance": mean_within_tolerance,
+        "sd_within_tolerance": sd_within_tolerance,
+        "warnings": warnings,
+        "suggestions": suggestions,
+    }