Files
yellow-bank-soal/app/services/normalization.py
Dwindi Ramadhana cf193d7ea0 first commit
2026-03-21 23:32:59 +07:00

539 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Dynamic Normalization Service.
Implements dynamic normalization with real-time calculation of rataan (mean)
and SB (standard deviation) for each tryout. Supports multiple normalization modes:
- Static: Use hardcoded rataan/SB from config
- Dynamic: Calculate rataan/SB from participant NM scores in real-time
- Hybrid: Use static until threshold reached, then switch to dynamic
"""
import logging
import math
from datetime import datetime, timezone
from typing import Literal, Optional, Tuple
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.models.tryout import Tryout
from app.models.tryout_stats import TryoutStats
logger = logging.getLogger(__name__)
async def calculate_dynamic_stats(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
) -> Tuple[Optional[float], Optional[float]]:
    """
    Read the stored mean (rataan) and standard deviation (SB) for a tryout.

    Looks up the TryoutStats row matching the (tryout_id, website_id) pair
    and hands back the rataan and sb values held on that row. Nothing is
    recomputed here.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier

    Returns:
        Tuple of (rataan, sb); (None, None) when no stats row exists
    """
    query = select(TryoutStats).where(
        TryoutStats.website_id == website_id,
        TryoutStats.tryout_id == tryout_id,
    )
    record = (await db.execute(query)).scalar_one_or_none()

    if record is not None:
        return record.rataan, record.sb
    return None, None
async def update_dynamic_normalization(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
    nm: int,
) -> Tuple[float, float]:
    """
    Fold one new NM score into the running statistics of a tryout.

    The TryoutStats row for (website_id, tryout_id) is loaded, or created
    when this is the first score. For an existing row the function:
    - bumps participant_count by 1
    - adds NM to total_nm_sum and NM² to total_nm_sq_sum
    - widens min_nm / max_nm when needed
    - recomputes rataan (mean) and sb (population standard deviation)

    The change is flushed to the session; this function does not commit.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier
        nm: Nilai Mentah (raw score) to add

    Returns:
        Tuple of updated (rataan, sb)

    Raises:
        ValueError: If nm is out of valid range [0, 1000]
    """
    if not 0 <= nm <= 1000:
        raise ValueError(f"nm must be in range [0, 1000], got {nm}")

    lookup = select(TryoutStats).where(
        TryoutStats.website_id == website_id,
        TryoutStats.tryout_id == tryout_id,
    )
    stats = (await db.execute(lookup)).scalar_one_or_none()

    if stats is not None:
        # Existing row: extend the running aggregates with this score.
        stats.participant_count += 1
        stats.total_nm_sum += nm
        stats.total_nm_sq_sum += nm * nm

        if stats.min_nm is None or nm < stats.min_nm:
            stats.min_nm = nm
        if stats.max_nm is None or nm > stats.max_nm:
            stats.max_nm = nm

        # Mean = Σ NM / n
        count = stats.participant_count
        average = stats.total_nm_sum / count
        stats.rataan = average

        if count > 1:
            # Population variance = (Σ NM² / n) - mean²; floating point
            # error can push it slightly below zero, so floor it at 0.
            spread = max(0.0, (stats.total_nm_sq_sum / count) - (average ** 2))
            stats.sb = math.sqrt(spread)
        else:
            stats.sb = 0.0

        stats.last_calculated = datetime.now(timezone.utc)
    else:
        # First score for this tryout: a single data point has SD 0.
        stats = TryoutStats(
            website_id=website_id,
            tryout_id=tryout_id,
            participant_count=1,
            total_nm_sum=float(nm),
            total_nm_sq_sum=float(nm * nm),
            rataan=float(nm),
            sb=0.0,
            min_nm=nm,
            max_nm=nm,
            last_calculated=datetime.now(timezone.utc),
        )
        db.add(stats)

    await db.flush()

    logger.info(
        f"Updated dynamic normalization for tryout {tryout_id}, "
        f"website {website_id}: participant_count={stats.participant_count}, "
        f"rataan={stats.rataan:.2f}, sb={stats.sb:.2f}"
    )

    # Both branches above assign rataan and sb.
    assert stats.rataan is not None
    assert stats.sb is not None
    return stats.rataan, stats.sb
def apply_normalization(
    nm: int,
    rataan: float,
    sb: float,
) -> int:
    """
    Apply normalization to NM to get NN (Nilai Nasional).

    Formula: NN = 500 + 100 × ((NM - Rataan) / SB)
    Normalizes scores to a mean=500, SD=100 distribution.

    Args:
        nm: Nilai Mentah (raw score) in range [0, 1000]
        rataan: Mean of NM scores
        sb: Standard deviation of NM scores

    Returns:
        NN rounded to an integer and clamped to [0, 1000]. When sb is zero
        or negative (e.g. all scores identical) the neutral score 500 is
        returned instead of raising.

    Raises:
        ValueError: If nm is out of range [0, 1000], or if the computed
            score is NaN (rataan/sb are NaN or both infinite)
    """
    if not 0 <= nm <= 1000:
        raise ValueError(f"nm must be in range [0, 1000], got {nm}")

    if sb <= 0:
        # No spread to scale by; fall back to the distribution centre.
        return 500

    z_score = (nm - rataan) / sb
    nn = 500 + 100 * z_score

    if math.isnan(nn):
        # min()/max() would silently swallow NaN, so reject it explicitly.
        raise ValueError(
            f"cannot normalize nm={nm} with rataan={rataan}, sb={sb}"
        )

    # Clamp BEFORE rounding: a very small positive sb makes nn infinite, and
    # round(inf) raises OverflowError. For finite values the result is the
    # same as rounding first and clamping afterwards.
    return round(max(0.0, min(1000.0, nn)))
async def get_normalization_mode(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
) -> Literal["static", "dynamic", "hybrid"]:
    """
    Look up which normalization mode a tryout is configured with.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier

    Returns:
        The tryout's normalization_mode: "static", "dynamic", or "hybrid"

    Raises:
        ValueError: If no tryout matches (website_id, tryout_id)
    """
    lookup = select(Tryout).where(
        Tryout.website_id == website_id,
        Tryout.tryout_id == tryout_id,
    )
    tryout = (await db.execute(lookup)).scalar_one_or_none()

    if tryout is not None:
        return tryout.normalization_mode

    raise ValueError(
        f"Tryout {tryout_id} not found for website {website_id}"
    )
async def check_threshold_for_dynamic(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
) -> bool:
    """
    Tell whether enough participants exist to use dynamic normalization.

    The participant_count stored in TryoutStats (0 when there is no stats
    row) is compared with the tryout's min_sample_for_dynamic setting; a
    missing setting is treated as 100.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier

    Returns:
        True if participant_count >= min_sample_for_dynamic, else False
    """
    # Current participant count
    stats_query = select(TryoutStats).where(
        TryoutStats.website_id == website_id,
        TryoutStats.tryout_id == tryout_id,
    )
    stats = (await db.execute(stats_query)).scalar_one_or_none()
    if stats:
        participants = stats.participant_count
    else:
        participants = 0

    # Configured threshold
    threshold_query = select(Tryout.min_sample_for_dynamic).where(
        Tryout.website_id == website_id,
        Tryout.tryout_id == tryout_id,
    )
    configured = (await db.execute(threshold_query)).scalar_one_or_none()
    required = 100 if configured is None else configured

    return participants >= required
async def _get_static_normalization_params(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
) -> Tuple[float, float]:
    """Fetch the configured (static_rataan, static_sb) pair for a tryout.

    Raises:
        ValueError: If no tryout matches (website_id, tryout_id)
    """
    result = await db.execute(
        select(Tryout.static_rataan, Tryout.static_sb).where(
            Tryout.website_id == website_id,
            Tryout.tryout_id == tryout_id,
        )
    )
    # one_or_none() returns the whole two-column Row. scalar_one_or_none()
    # would return only the first column, which cannot be unpacked into
    # (rataan, sb) and would make a NULL static_rataan look like "not found".
    row = result.one_or_none()
    if row is None:
        raise ValueError(
            f"Tryout {tryout_id} not found for website {website_id}"
        )
    rataan, sb = row
    return rataan, sb


async def get_normalization_params(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
) -> Tuple[float, float, Literal["static", "dynamic"]]:
    """
    Get normalization parameters (rataan, sb) based on current mode.

    Determines which normalization parameters to use:
    - Static mode: Use the tryout's static_rataan and static_sb
    - Dynamic mode: Use rataan and sb stored in TryoutStats
    - Hybrid mode: Use static until the participant threshold is reached,
      then dynamic; falls back to static if the threshold is met but no
      dynamic stats are available

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier

    Returns:
        Tuple of (rataan, sb, mode_used), where mode_used is the source
        actually used ("static" or "dynamic")

    Raises:
        ValueError: If tryout not found, or mode is "dynamic" and no stats
            have been calculated yet
    """
    mode = await get_normalization_mode(db, website_id, tryout_id)

    if mode == "static":
        rataan, sb = await _get_static_normalization_params(
            db, website_id, tryout_id
        )
        return rataan, sb, "static"

    if mode == "dynamic":
        rataan, sb = await calculate_dynamic_stats(db, website_id, tryout_id)
        if rataan is None or sb is None:
            raise ValueError(
                f"Dynamic normalization not available for tryout {tryout_id}. "
                "No stats have been calculated yet."
            )
        if sb == 0:
            logger.warning(
                f"Standard deviation is 0 for tryout {tryout_id}. "
                "All NM scores are identical."
            )
        return rataan, sb, "dynamic"

    # Hybrid: prefer dynamic once the sample threshold is met and stats exist.
    if await check_threshold_for_dynamic(db, website_id, tryout_id):
        rataan, sb = await calculate_dynamic_stats(db, website_id, tryout_id)
        if rataan is not None and sb is not None:
            return rataan, sb, "dynamic"

    # Threshold not met, or dynamic stats missing: use the static values.
    rataan, sb = await _get_static_normalization_params(
        db, website_id, tryout_id
    )
    return rataan, sb, "static"
async def calculate_skewness(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
) -> Optional[float]:
    """
    Calculate skewness of NM distribution for validation.

    Skewness measures the asymmetry of the probability distribution.
    Values:
    - Skewness ≈ 0: Symmetric distribution
    - Skewness > 0: Right-skewed (tail to the right)
    - Skewness < 0: Left-skewed (tail to the left)

    Intended formula: Skewness = (n / ((n-1)(n-2))) * Σ((x - mean) / SD)³

    NOTE: the formula is not evaluated yet. It needs the individual NM
    values, which are not read from TryoutStats here. The only definite
    answer currently produced is 0.0 when the stored SD is 0.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier

    Returns:
        0.0 when at least 3 participants exist and sb is 0 (all values
        identical); None in every other case (no stats row, fewer than
        3 participants, or skewness not computable here)
    """
    result = await db.execute(
        select(TryoutStats).where(
            TryoutStats.website_id == website_id,
            TryoutStats.tryout_id == tryout_id,
        )
    )
    stats = result.scalar_one_or_none()

    if stats is None or stats.participant_count < 3:
        # Need at least 3 samples for skewness calculation
        return None

    if stats.sb == 0:
        return 0.0  # All values are identical

    # TODO: computing real skewness requires storing all NM values (or
    # calculating on-the-fly from the individual scores).
    return None
async def validate_dynamic_normalization(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
    target_mean: float = 500.0,
    target_sd: float = 100.0,
    mean_tolerance: float = 5.0,
    sd_tolerance: float = 5.0,
) -> Tuple[bool, dict]:
    """
    Check the stored rataan and sb of a tryout against target values.

    The result is valid only when both the mean and the SD lie within their
    tolerance of the target. Human-readable warnings and suggestions are
    collected along the way.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier
        target_mean: Target mean (default: 500)
        target_sd: Target standard deviation (default: 100)
        mean_tolerance: Allowed deviation from target mean (default: 5)
        sd_tolerance: Allowed deviation from target SD (default: 5)

    Returns:
        Tuple of (is_valid, validation_details)
        validation_details contains:
        - participant_count: Number of participants
        - current_rataan: Current mean
        - current_sb: Current standard deviation
        - mean_deviation: Absolute deviation from target mean
        - sd_deviation: Absolute deviation from target SD
        - mean_within_tolerance: True if mean deviation <= mean_tolerance
        - sd_within_tolerance: True if SD deviation <= sd_tolerance
        - warnings: List of warning messages
        - suggestions: List of suggestions
    """
    query = select(TryoutStats).where(
        TryoutStats.website_id == website_id,
        TryoutStats.tryout_id == tryout_id,
    )
    stats = (await db.execute(query)).scalar_one_or_none()

    # Nothing to validate without a stats row carrying both values.
    if stats is None or stats.rataan is None or stats.sb is None:
        return False, {
            "participant_count": 0,
            "current_rataan": None,
            "current_sb": None,
            "mean_deviation": None,
            "sd_deviation": None,
            "mean_within_tolerance": False,
            "sd_within_tolerance": False,
            "warnings": ["No statistics available for validation"],
            "suggestions": ["Wait for more participants to complete sessions"],
        }

    mean_deviation = abs(stats.rataan - target_mean)
    sd_deviation = abs(stats.sb - target_sd)
    mean_ok = mean_deviation <= mean_tolerance
    sd_ok = sd_deviation <= sd_tolerance

    warning_msgs = []
    suggestion_msgs = []

    if not mean_ok:
        warning_msgs.append(f"Mean deviation ({mean_deviation:.2f}) exceeds tolerance ({mean_tolerance})")
        suggestion_msgs.append(
            "Distribution may be right-skewed - consider checking question difficulty"
            if stats.rataan > target_mean
            else "Distribution may be left-skewed - consider checking question difficulty"
        )

    if not sd_ok:
        warning_msgs.append(f"SD deviation ({sd_deviation:.2f}) exceeds tolerance ({sd_tolerance})")
        suggestion_msgs.append(
            "SD too low - scores may be too tightly clustered"
            if stats.sb < target_sd
            else "SD too high - scores may have too much variance"
        )

    # Asymmetry check
    skewness = await calculate_skewness(db, website_id, tryout_id)
    if skewness is not None and abs(skewness) > 0.5:
        warning_msgs.append(f"Distribution skewness ({skewness:.2f}) > 0.5 - distribution may be asymmetric")
        suggestion_msgs.append("Consider using static normalization if dynamic normalization is unstable")

    # Sample-size hint
    if stats.participant_count < 100:
        suggestion_msgs.append(f"Participant count ({stats.participant_count}) below recommended minimum (100)")

    details = {
        "participant_count": stats.participant_count,
        "current_rataan": stats.rataan,
        "current_sb": stats.sb,
        "mean_deviation": mean_deviation,
        "sd_deviation": sd_deviation,
        "mean_within_tolerance": mean_ok,
        "sd_within_tolerance": sd_ok,
        "warnings": warning_msgs,
        "suggestions": suggestion_msgs,
    }
    return mean_ok and sd_ok, details