first commit
This commit is contained in:
538
app/services/normalization.py
Normal file
538
app/services/normalization.py
Normal file
@@ -0,0 +1,538 @@
|
||||
"""
|
||||
Dynamic Normalization Service.
|
||||
|
||||
Implements dynamic normalization with real-time calculation of rataan and SB
|
||||
for each tryout. Supports multiple normalization modes:
|
||||
- Static: Use hardcoded rataan/SB from config
|
||||
- Dynamic: Calculate rataan/SB from participant NM scores in real-time
|
||||
- Hybrid: Use static until threshold reached, then switch to dynamic
|
||||
"""
|
||||
|
||||
import logging
|
||||
import math
|
||||
from datetime import datetime, timezone
|
||||
from typing import Literal, Optional, Tuple
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.models.tryout import Tryout
|
||||
from app.models.tryout_stats import TryoutStats
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def calculate_dynamic_stats(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
) -> Tuple[Optional[float], Optional[float]]:
    """
    Read the stored dynamic stats (rataan and SB) for one tryout.

    Nothing is recomputed here: the values are taken as-is from the
    TryoutStats row matching the (website_id, tryout_id) pair.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier

    Returns:
        Tuple of (rataan, sb) from the matching row, or (None, None) when
        no TryoutStats row exists for the pair
    """
    query = select(TryoutStats).where(
        TryoutStats.website_id == website_id,
        TryoutStats.tryout_id == tryout_id,
    )
    record = (await db.execute(query)).scalar_one_or_none()

    if record is not None:
        return record.rataan, record.sb

    # No row yet for this tryout/website pair.
    return None, None
|
||||
|
||||
|
||||
async def update_dynamic_normalization(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
    nm: int,
) -> Tuple[float, float]:
    """
    Fold one new NM score into the running statistics of a tryout.

    The TryoutStats row for (website_id, tryout_id) is created on the first
    score and updated incrementally afterwards:
    - participant_count grows by 1
    - total_nm_sum grows by NM, total_nm_sq_sum by NM²
    - min_nm / max_nm are widened when needed
    - rataan (mean) and sb (population standard deviation) are recomputed
      from the running sums

    The session is flushed, not committed.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier
        nm: Nilai Mentah (raw score) to add

    Returns:
        Tuple of updated (rataan, sb)

    Raises:
        ValueError: If nm is out of valid range [0, 1000]
    """
    nm_in_range = 0 <= nm <= 1000
    if not nm_in_range:
        raise ValueError(f"nm must be in range [0, 1000], got {nm}")

    query = select(TryoutStats).where(
        TryoutStats.website_id == website_id,
        TryoutStats.tryout_id == tryout_id,
    )
    record = (await db.execute(query)).scalar_one_or_none()

    if record is not None:
        # Existing row: extend the running aggregates with this score.
        record.participant_count += 1
        record.total_nm_sum += nm
        record.total_nm_sq_sum += nm * nm

        if record.min_nm is None or nm < record.min_nm:
            record.min_nm = nm
        if record.max_nm is None or nm > record.max_nm:
            record.max_nm = nm

        count = record.participant_count

        # Mean = Σ NM / n
        average = record.total_nm_sum / count
        record.rataan = average

        if count > 1:
            # Population variance = (Σ NM² / n) - mean²; floating point
            # error can push it slightly below zero, hence the clamp.
            spread = (record.total_nm_sq_sum / count) - (average ** 2)
            record.sb = math.sqrt(max(0.0, spread))
        else:
            record.sb = 0.0

        record.last_calculated = datetime.now(timezone.utc)
    else:
        # First score for this pair: a single data point has SD 0.
        record = TryoutStats(
            website_id=website_id,
            tryout_id=tryout_id,
            participant_count=1,
            total_nm_sum=float(nm),
            total_nm_sq_sum=float(nm * nm),
            rataan=float(nm),
            sb=0.0,
            min_nm=nm,
            max_nm=nm,
            last_calculated=datetime.now(timezone.utc),
        )
        db.add(record)

    await db.flush()

    logger.info(
        f"Updated dynamic normalization for tryout {tryout_id}, "
        f"website {website_id}: participant_count={record.participant_count}, "
        f"rataan={record.rataan:.2f}, sb={record.sb:.2f}"
    )

    # Both branches above assign rataan and sb; the asserts narrow the
    # Optional types for the return value.
    assert record.rataan is not None
    assert record.sb is not None
    return record.rataan, record.sb
|
||||
|
||||
|
||||
def apply_normalization(
    nm: int,
    rataan: float,
    sb: float,
) -> int:
    """
    Convert a raw score (NM) into a normalized score (NN, Nilai Nasional).

    Formula: NN = 500 + 100 × ((NM - Rataan) / SB)

    The result is rounded with the built-in round() and then clamped to
    [0, 1000].

    Args:
        nm: Nilai Mentah (raw score) in range [0, 1000]
        rataan: Mean of NM scores
        sb: Standard deviation of NM scores

    Returns:
        NN (normalized score) in range [0, 1000]; exactly 500 whenever
        sb is zero or negative

    Raises:
        ValueError: If nm is outside [0, 1000]
    """
    nm_in_range = 0 <= nm <= 1000
    if not nm_in_range:
        raise ValueError(f"nm must be in range [0, 1000], got {nm}")

    # With no spread (e.g. all scores identical) the z-score is undefined,
    # so the centre of the target scale is returned instead.
    if sb <= 0:
        return 500

    deviation_units = (nm - rataan) / sb
    scaled = round(500 + 100 * deviation_units)

    # Clamp to the valid NN range.
    if scaled < 0:
        return 0
    if scaled > 1000:
        return 1000
    return scaled
|
||||
|
||||
|
||||
async def get_normalization_mode(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
) -> Literal["static", "dynamic", "hybrid"]:
    """
    Look up the normalization mode configured on a tryout.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier

    Returns:
        The normalization_mode attribute of the matching Tryout row

    Raises:
        ValueError: If no Tryout matches (website_id, tryout_id)
    """
    query = select(Tryout).where(
        Tryout.website_id == website_id,
        Tryout.tryout_id == tryout_id,
    )
    found = (await db.execute(query)).scalar_one_or_none()

    if found is not None:
        return found.normalization_mode

    raise ValueError(f"Tryout {tryout_id} not found for website {website_id}")
|
||||
|
||||
|
||||
async def check_threshold_for_dynamic(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
) -> bool:
    """
    Tell whether enough participants exist to use dynamic normalization.

    The participant_count of the TryoutStats row (0 when there is no row)
    is compared against the tryout's min_sample_for_dynamic setting. When
    that setting cannot be read (no matching tryout, or a NULL value) a
    threshold of 100 is used.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier

    Returns:
        True if participant_count >= the threshold, else False
    """
    # How many participants have been recorded so far?
    stats_query = select(TryoutStats).where(
        TryoutStats.website_id == website_id,
        TryoutStats.tryout_id == tryout_id,
    )
    stats_row = (await db.execute(stats_query)).scalar_one_or_none()
    if stats_row:
        seen = stats_row.participant_count
    else:
        seen = 0

    # How many are required by the tryout configuration?
    threshold_query = select(Tryout.min_sample_for_dynamic).where(
        Tryout.website_id == website_id,
        Tryout.tryout_id == tryout_id,
    )
    configured = (await db.execute(threshold_query)).scalar_one_or_none()
    required = 100 if configured is None else configured

    return seen >= required
|
||||
|
||||
|
||||
async def _fetch_static_params(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
) -> Tuple[float, float]:
    """Load the configured (static_rataan, static_sb) pair for one tryout."""
    result = await db.execute(
        select(Tryout.static_rataan, Tryout.static_sb).where(
            Tryout.website_id == website_id,
            Tryout.tryout_id == tryout_id,
        )
    )
    # Two columns are selected, so the whole row is required here.
    # scalar_one_or_none() would hand back only the first column, which
    # cannot be unpacked into (rataan, sb) and would make a NULL
    # static_rataan look like a missing tryout.
    row = result.one_or_none()

    if row is None:
        raise ValueError(
            f"Tryout {tryout_id} not found for website {website_id}"
        )

    rataan, sb = row
    return rataan, sb


async def get_normalization_params(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
) -> Tuple[float, float, Literal["static", "dynamic"]]:
    """
    Get normalization parameters (rataan, sb) based on current mode.

    Determines which normalization parameters to use:
    - Static mode: Use the tryout's static_rataan and static_sb
    - Dynamic mode: Use rataan and sb stored in TryoutStats
    - Hybrid mode: Use static until the participant threshold is reached,
      then dynamic; falls back to static if dynamic values are missing

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier

    Returns:
        Tuple of (rataan, sb, mode_used), where mode_used is the source of
        the values actually returned ("static" or "dynamic")

    Raises:
        ValueError: If tryout not found, or if the mode is "dynamic" and
            no stats are available yet
    """
    mode = await get_normalization_mode(db, website_id, tryout_id)

    if mode == "static":
        rataan, sb = await _fetch_static_params(db, website_id, tryout_id)
        return rataan, sb, "static"

    if mode == "dynamic":
        rataan, sb = await calculate_dynamic_stats(db, website_id, tryout_id)

        if rataan is None or sb is None:
            raise ValueError(
                f"Dynamic normalization not available for tryout {tryout_id}. "
                "No stats have been calculated yet."
            )

        if sb == 0:
            # Still returned to the caller; only flagged in the log.
            logger.warning(
                f"Standard deviation is 0 for tryout {tryout_id}. "
                "All NM scores are identical."
            )

        return rataan, sb, "dynamic"

    # Hybrid: dynamic once the threshold is met and stats exist, otherwise
    # the static configuration.
    if await check_threshold_for_dynamic(db, website_id, tryout_id):
        rataan, sb = await calculate_dynamic_stats(db, website_id, tryout_id)
        if rataan is not None and sb is not None:
            return rataan, sb, "dynamic"

    rataan, sb = await _fetch_static_params(db, website_id, tryout_id)
    return rataan, sb, "static"
|
||||
|
||||
|
||||
async def calculate_skewness(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
) -> Optional[float]:
    """
    Report the skewness of the NM distribution where it can be determined.

    Skewness measures the asymmetry of a distribution:
    - Skewness ≈ 0: Symmetric distribution
    - Skewness > 0: Right-skewed (tail to the right)
    - Skewness < 0: Left-skewed (tail to the left)

    NOTE: this is currently a placeholder. The sample skewness formula
    (n / ((n-1)(n-2))) * Σ((x - mean) / SD)³ needs the individual NM
    values, which are not read here (only the TryoutStats row is), so the
    only value this function can report is 0.0 for a zero-SD distribution.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier

    Returns:
        0.0 when at least 3 participants exist and the stored SD is 0
        (all values identical); None in every other case (no stats,
        fewer than 3 participants, or skewness not computable)
    """
    result = await db.execute(
        select(TryoutStats).where(
            TryoutStats.website_id == website_id,
            TryoutStats.tryout_id == tryout_id,
        )
    )
    stats = result.scalar_one_or_none()

    # Need at least 3 samples for skewness to be defined.
    if stats is None or stats.participant_count < 3:
        return None

    if stats.sb == 0:
        return 0.0  # All values are identical

    # TODO: computing a real value requires the individual NM scores
    # (stored separately or calculated on the fly).
    return None
|
||||
|
||||
|
||||
async def validate_dynamic_normalization(
    db: AsyncSession,
    website_id: int,
    tryout_id: str,
    target_mean: float = 500.0,
    target_sd: float = 100.0,
    mean_tolerance: float = 5.0,
    sd_tolerance: float = 5.0,
) -> Tuple[bool, dict]:
    """
    Compare a tryout's stored rataan and sb against target values.

    Args:
        db: Async database session
        website_id: Website identifier
        tryout_id: Tryout identifier
        target_mean: Target mean (default: 500)
        target_sd: Target standard deviation (default: 100)
        mean_tolerance: Allowed deviation from target mean (default: 5)
        sd_tolerance: Allowed deviation from target SD (default: 5)

    Returns:
        Tuple of (is_valid, validation_details). is_valid is True only
        when both deviations are within tolerance; it is False when no
        stats (or no rataan/sb) are available.

        validation_details contains:
        - participant_count: Number of participants
        - current_rataan: Current mean
        - current_sb: Current standard deviation
        - mean_deviation: Absolute deviation from target mean
        - sd_deviation: Absolute deviation from target SD
        - mean_within_tolerance: True if mean deviation <= mean_tolerance
        - sd_within_tolerance: True if SD deviation <= sd_tolerance
        - warnings: List of warning messages
        - suggestions: List of suggestions
    """
    query = select(TryoutStats).where(
        TryoutStats.website_id == website_id,
        TryoutStats.tryout_id == tryout_id,
    )
    record = (await db.execute(query)).scalar_one_or_none()

    # Nothing to validate against yet.
    if record is None or record.rataan is None or record.sb is None:
        return False, {
            "participant_count": 0,
            "current_rataan": None,
            "current_sb": None,
            "mean_deviation": None,
            "sd_deviation": None,
            "mean_within_tolerance": False,
            "sd_within_tolerance": False,
            "warnings": ["No statistics available for validation"],
            "suggestions": ["Wait for more participants to complete sessions"],
        }

    mean_gap = abs(record.rataan - target_mean)
    sd_gap = abs(record.sb - target_sd)

    # Tolerances are inclusive.
    mean_ok = mean_gap <= mean_tolerance
    sd_ok = sd_gap <= sd_tolerance

    warning_msgs = []
    advice = []

    if not mean_ok:
        warning_msgs.append(
            f"Mean deviation ({mean_gap:.2f}) exceeds tolerance ({mean_tolerance})"
        )
        advice.append(
            "Distribution may be right-skewed - consider checking question difficulty"
            if record.rataan > target_mean
            else "Distribution may be left-skewed - consider checking question difficulty"
        )

    if not sd_ok:
        warning_msgs.append(
            f"SD deviation ({sd_gap:.2f}) exceeds tolerance ({sd_tolerance})"
        )
        advice.append(
            "SD too low - scores may be too tightly clustered"
            if record.sb < target_sd
            else "SD too high - scores may have too much variance"
        )

    # Skewness is only reported when a value is available.
    skew = await calculate_skewness(db, website_id, tryout_id)
    if skew is not None and abs(skew) > 0.5:
        warning_msgs.append(
            f"Distribution skewness ({skew:.2f}) > 0.5 - distribution may be asymmetric"
        )
        advice.append(
            "Consider using static normalization if dynamic normalization is unstable"
        )

    # A small sample only produces a suggestion; it does not affect validity.
    if record.participant_count < 100:
        advice.append(
            f"Participant count ({record.participant_count}) below recommended minimum (100)"
        )

    details = {
        "participant_count": record.participant_count,
        "current_rataan": record.rataan,
        "current_sb": record.sb,
        "mean_deviation": mean_gap,
        "sd_deviation": sd_gap,
        "mean_within_tolerance": mean_ok,
        "sd_within_tolerance": sd_ok,
        "warnings": warning_msgs,
        "suggestions": advice,
    }
    return mean_ok and sd_ok, details
|
||||
Reference in New Issue
Block a user