Source code for rheojax.utils.data_quality

"""Data quality and range detection utilities.

This module provides utilities for detecting data characteristics that affect
optimization quality, such as very wide frequency ranges (mastercurves).
"""

from __future__ import annotations

import warnings

import numpy as np

from rheojax.logging import get_logger

logger = get_logger(__name__)


def detect_data_range_decades(x: np.ndarray) -> float:
    """Detect the range of data in decades (log10 scale).

    Only strictly positive, finite entries take part in the computation.
    Zeros, negative values, NaN and +/-Inf are ignored because none of them
    has a usable position on a log10 axis.

    Args:
        x: Data array (e.g., frequency, time). Any array-like is accepted
            (ndarray of any shape, list, tuple or scalar).

    Returns:
        Range in decades (log10(max/min)) of the positive finite values, or
        0.0 when there are no such values.

    Example:
        >>> freq = np.array([1e-8, 1e-6, 1e-4, 1e4])
        >>> decades = detect_data_range_decades(freq)
        >>> print(f"{decades:.1f} decades")
        12.0 decades
    """
    # Coerce first so that lists, tuples and scalars support boolean masking.
    values = np.asarray(x)
    logger.debug(
        "Detecting data range in decades",
        input_length=values.size,
    )

    # NaN already fails the `> 0` comparison; +Inf passes it and would turn
    # the result into an infinite decade count, so it is excluded explicitly.
    x_positive = values[(values > 0) & np.isfinite(values)]
    if x_positive.size == 0:
        logger.debug("No positive values in data, returning 0 decades")
        return 0.0

    x_min = np.min(x_positive)
    x_max = np.max(x_positive)

    # x_min and x_max are guaranteed > 0 and finite by the mask above.
    decades = float(np.log10(x_max / x_min))

    logger.debug(
        "Data range computed",
        x_min=x_min,
        x_max=x_max,
        decades=decades,
    )
    return decades
def check_wide_frequency_range(
    x: np.ndarray,
    threshold_decades: float = 8.0,
    warn: bool = True,
    recommend_log_residuals: bool = True,
) -> dict[str, bool | float | str]:
    """Check if data has a very wide frequency/time range (e.g., mastercurve).

    Wide-range data (>8 decades) can cause optimization problems:
    - Optimizer bias toward high-value regions
    - Poor parameter recovery
    - Convergence to local minima

    Recommended solutions:
    - Use log-space residuals (use_log_residuals=True)
    - Fit to subset of data for initialization
    - Use multi-start optimization

    Args:
        x: Independent variable data (frequency, time, etc.)
        threshold_decades: Threshold for "wide range" warning (default: 8.0)
        warn: Whether to emit a ``UserWarning`` if range is wide
            (default: True). The warning is emitted for every wide range,
            whether or not a recommendation is attached.
        recommend_log_residuals: Whether to include the log-residuals
            recommendation in the warning and in the returned dictionary.
            When False, the warning only states that a wide range was
            detected and 'recommendation' is an empty string.

    Returns:
        Dictionary with keys:
        - 'is_wide_range': True if range > threshold
        - 'decades': Actual range in decades
        - 'recommendation': Recommended action (or empty string)

    Example:
        >>> omega = np.logspace(-8, 4, 100)  # 12 decades (mastercurve)
        >>> result = check_wide_frequency_range(omega)
        >>> if result['is_wide_range']:
        ...     print(f"Wide range detected: {result['decades']:.1f} decades")
        ...     print(result['recommendation'])
    """
    logger.debug(
        "Checking for wide frequency range",
        threshold_decades=threshold_decades,
        warn_enabled=warn,
    )

    decades = detect_data_range_decades(x)
    is_wide = decades > threshold_decades
    recommendation = ""

    if is_wide:
        # `:g` keeps fractional thresholds readable (8.5 -> "8.5"), while the
        # default 8.0 still prints as "8".
        headline = (
            f"Wide frequency range ({decades:.1f} decades > "
            f"{threshold_decades:g}) detected."
        )
        if recommend_log_residuals:
            recommendation = (
                f"{headline} "
                f"Recommend using log-space residuals to prevent optimization bias:\n"
                f" model.fit(X, y, use_log_residuals=True)\n"
                f"Or fit to a subset for initialization:\n"
                f" X_subset = X[(X > 0.01) & (X < 100)] # Middle 4 decades"
            )
        logger.info(
            "Wide frequency range detected",
            decades=decades,
            threshold_decades=threshold_decades,
        )
        if warn:
            # Without a recommendation the warning still reports the
            # detection, so `warn=True` is never silently ignored.
            # NOTE(review): stacklevel=3 attributes the warning two frames
            # above this function - presumably the user code that called a
            # library wrapper around this check; confirm against callers.
            warnings.warn(
                recommendation or headline,
                UserWarning,
                stacklevel=3,
            )
    else:
        logger.debug(
            "Frequency range within normal bounds",
            decades=decades,
            threshold_decades=threshold_decades,
        )

    return {
        "is_wide_range": is_wide,
        "decades": decades,
        "recommendation": recommendation,
    }
def suggest_optimization_strategy(
    x: np.ndarray,
    y: np.ndarray,
    test_mode: str | None = None,
) -> dict[str, bool | str | float]:
    """Recommend optimizer settings from the span of the data.

    The decade span of ``x`` selects one of three tiers:

    - more than 10 decades: log-residuals, subset initialization and
      multi-start optimization
    - more than 8 decades: log-residuals and multi-start optimization
    - more than 5 decades: log-residuals only

    Anything narrower keeps the default linear residuals.

    Args:
        x: Independent variable (frequency, time, etc.)
        y: Dependent variable (modulus, stress, etc.). Only inspected for
            being complex-valued.
        test_mode: Test mode ('oscillation', 'relaxation', 'creep')

    Returns:
        Dictionary with optimization recommendations:
        - 'use_log_residuals': Recommended for wide ranges
        - 'use_multi_start': Recommended for complex landscapes
        - 'use_subset_init': Recommended for very wide ranges
        - 'decades': Range of ``x`` in decades
        - 'rationale': Explanation of recommendations

    Example:
        >>> omega = np.logspace(-8, 4, 100)
        >>> G_star = ...  # Complex modulus data
        >>> strategy = suggest_optimization_strategy(omega, G_star, 'oscillation')
        >>> print(strategy['rationale'])
    """
    n_x = len(x) if hasattr(x, "__len__") else 1
    n_y = len(y) if hasattr(y, "__len__") else 1
    y_is_complex = np.iscomplexobj(y)
    logger.debug(
        "Analyzing data for optimization strategy",
        x_length=n_x,
        y_length=n_y,
        test_mode=test_mode,
        y_is_complex=y_is_complex,
    )

    # The range check is only used for its decade count; its warning is off.
    decades: float = check_wide_frequency_range(x, warn=False)["decades"]  # type: ignore[assignment]

    # The tiers are nested, so each flag is a single threshold comparison.
    use_log_residuals = bool(decades > 5)
    use_multi_start = bool(decades > 8)
    use_subset_init = bool(decades > 10)

    notes: list[str] = []
    if use_subset_init:
        # Rule 1: Very wide range (>10 decades) - mastercurve
        notes.append(
            f"Very wide range ({decades:.1f} decades): Using log-residuals, "
            f"subset initialization, and multi-start optimization for robustness."
        )
        logger.debug(
            "Applied very wide range strategy",
            decades=decades,
            rule="rule_1_very_wide",
        )
    elif use_multi_start:
        # Rule 2: Wide range (8-10 decades)
        notes.append(
            f"Wide range ({decades:.1f} decades): Using log-residuals and "
            f"multi-start optimization."
        )
        logger.debug(
            "Applied wide range strategy",
            decades=decades,
            rule="rule_2_wide",
        )
    elif use_log_residuals:
        # Rule 3: Moderate range (5-8 decades)
        notes.append(
            f"Moderate range ({decades:.1f} decades): Using log-residuals "
            f"to balance frequency regions."
        )
        logger.debug(
            "Applied moderate range strategy",
            decades=decades,
            rule="rule_3_moderate",
        )

    # Rule 4: Oscillation mode with complex data
    # NOTE(review): as written this cannot fire - every span above 6 decades
    # already has log-residuals enabled by the tiers above. Confirm the
    # intended threshold with the module owner.
    if (
        test_mode == "oscillation"
        and y_is_complex
        and decades > 6
        and not use_log_residuals
    ):
        use_log_residuals = True
        notes.append("Oscillation mode with complex modulus: Using log-residuals.")
        logger.debug(
            "Applied oscillation mode strategy",
            decades=decades,
            rule="rule_4_oscillation_complex",
        )

    # Default case: no rule matched
    if not notes:
        notes.append(
            f"Standard range ({decades:.1f} decades): Using default linear residuals."
        )
        logger.debug(
            "Applied standard strategy",
            decades=decades,
            rule="default",
        )

    logger.info(
        "Optimization strategy suggested",
        decades=decades,
        use_log_residuals=use_log_residuals,
        use_multi_start=use_multi_start,
        use_subset_init=use_subset_init,
    )

    strategy: dict[str, bool | str | float] = {
        "use_log_residuals": use_log_residuals,
        "use_multi_start": use_multi_start,
        "use_subset_init": use_subset_init,
        "decades": decades,
        "rationale": " ".join(notes),
    }
    return strategy
def check_nan_inf(
    data: np.ndarray,
    label: str = "data",
) -> dict[str, object]:
    """Count NaN and Inf entries in an array and summarize the result.

    Args:
        data: Array to inspect (any shape; it is flattened internally).
        label: Human-readable name for this array, echoed in the result and
            in the log records.

    Returns:
        Dictionary with keys:
        - 'label': The provided label string.
        - 'n_nan': Number of NaN values.
        - 'n_inf': Number of Inf values (positive or negative).
        - 'has_issues': True if any NaN or Inf is present.
        - 'fraction_clean': One minus the NaN/Inf share of all entries.

    Example:
        >>> arr = np.array([1.0, np.nan, np.inf, 2.0])
        >>> result = check_nan_inf(arr, label="G_star")
        >>> result['n_nan']
        1
        >>> result['has_issues']
        True
    """
    values = np.asarray(data).reshape(-1)
    nan_count = int(np.count_nonzero(np.isnan(values)))
    inf_count = int(np.count_nonzero(np.isinf(values)))

    # Clamp to 1 so that an empty array does not divide by zero.
    n_points = max(values.size, 1)
    clean_fraction = 1.0 - (nan_count + inf_count) / n_points
    flagged = nan_count > 0 or inf_count > 0

    report = {
        "label": label,
        "n_nan": nan_count,
        "n_inf": inf_count,
        "has_issues": flagged,
        "fraction_clean": clean_fraction,
    }

    if not flagged:
        logger.debug("Data quality OK", label=label, n_points=n_points)
        return report

    logger.info(
        "Data quality issue detected",
        label=label,
        n_nan=nan_count,
        n_inf=inf_count,
        fraction_clean=clean_fraction,
    )
    return report
def check_monotonicity(
    x: np.ndarray,
    threshold: float = 0.95,
) -> dict[str, object]:
    """Check whether an array is approximately monotonic.

    An array is considered monotonic if at least *threshold* fraction of
    consecutive differences share the same sign. Arrays with fewer than two
    elements, and arrays whose consecutive differences are all exactly zero,
    are reported as 'constant' and monotonic regardless of *threshold*.

    Args:
        x: 1-D array to check.
        threshold: Minimum fraction of steps that must be consistently
            increasing or decreasing to classify as monotonic
            (default: 0.95).

    Returns:
        Dictionary with keys:
        - 'is_monotonic': True if the dominant direction reaches threshold
          (or the array is constant).
        - 'direction': 'increasing', 'decreasing', 'constant', or 'mixed'.
        - 'fraction': Fraction of steps in the dominant direction
          (1.0 for constant arrays).

    Example:
        >>> x = np.array([1.0, 2.0, 3.0, 2.9, 4.0])  # 3 of 4 steps increase
        >>> check_monotonicity(x, threshold=0.7)['direction']
        'increasing'
        >>> check_monotonicity(x, threshold=0.95)['direction']
        'mixed'
        >>> check_monotonicity(np.array([2.0, 2.0, 2.0]))['direction']
        'constant'
    """
    x = np.asarray(x)
    if len(x) < 2:
        return {"is_monotonic": True, "direction": "constant", "fraction": 1.0}

    diffs = np.diff(x)

    # A flat array has neither rising nor falling steps, so the fraction test
    # below would label it 'mixed'. NaN differences compare unequal to zero
    # and therefore never take this branch.
    if np.all(diffs == 0):
        return {"is_monotonic": True, "direction": "constant", "fraction": 1.0}

    n_total = len(diffs)
    n_inc = int(np.sum(diffs > 0))
    n_dec = int(np.sum(diffs < 0))

    frac_inc = n_inc / n_total
    frac_dec = n_dec / n_total

    if frac_inc >= threshold:
        result: dict[str, object] = {
            "is_monotonic": True,
            "direction": "increasing",
            "fraction": frac_inc,
        }
    elif frac_dec >= threshold:
        result = {
            "is_monotonic": True,
            "direction": "decreasing",
            "fraction": frac_dec,
        }
    else:
        result = {
            "is_monotonic": False,
            "direction": "mixed",
            "fraction": max(frac_inc, frac_dec),
        }

    logger.debug(
        "Monotonicity check",
        direction=result["direction"],
        is_monotonic=result["is_monotonic"],
        fraction=result["fraction"],
        n_total=n_total,
    )
    return result
# Public API of this module: the names exported by a star-import.
__all__ = [ "detect_data_range_decades", "check_wide_frequency_range", "suggest_optimization_strategy", "check_nan_inf", "check_monotonicity", ]