# Source code for rheojax.pipeline.base

"""Base pipeline class for fluent API workflows.

This module provides the core Pipeline class that enables intuitive method
chaining for common rheological analysis workflows.

Example:
    >>> from rheojax.pipeline import Pipeline
    >>> pipeline = Pipeline()
    >>> result = (pipeline
    ...     .load('data.csv')
    ...     .transform('smooth', window_size=5)
    ...     .fit('maxwell')
    ...     .plot()
    ...     .save('result.hdf5')
    ...     .get_result())
"""

from __future__ import annotations

import copy
import uuid
import warnings
from pathlib import Path
from typing import TYPE_CHECKING, Any, Self

import numpy as np

from rheojax.core._validation import reject_removed_options
from rheojax.core.base import BaseModel, BaseTransform
from rheojax.core.data import RheoData
from rheojax.core.jax_config import safe_import_jax
from rheojax.core.registry import ModelRegistry, TransformRegistry
from rheojax.logging import get_logger, log_pipeline_stage

# Safe JAX import (enforces float64): jax/jnp are obtained through the
# project's safe_import_jax() helper rather than imported directly.
jax, jnp = safe_import_jax()

# Module-level logger used by the pipeline classes defined below.
logger = get_logger(__name__)


def _is_jax_array(x: Any) -> bool:
    """Return True when ``x`` looks like a JAX array.

    The check is duck-typed so it does not depend on a particular JAX
    version: the object must expose a ``devices`` attribute and must not be
    a NumPy ``ndarray`` (or subclass of it).
    """
    if not hasattr(x, "devices"):
        return False
    return not isinstance(x, np.ndarray)


class _PipelineState:
    """Shared instance-attribute/method declarations for Pipeline's mixins.

    Purely a static-typing aid: the actual attributes are set by
    Pipeline.__init__ and the stubbed methods are implemented in Pipeline
    itself. Subclassed (never instantiated directly) by _PipelineIO and
    _PipelinePlotting so mypy knows what `self` provides in each mixin,
    without duplicating the real implementations.

    Everything below lives under ``if TYPE_CHECKING:``, so at runtime this
    class body is empty and contributes no attributes or methods.
    """

    if TYPE_CHECKING:
        # Current RheoData state (None until data is supplied or loaded).
        data: RheoData | None
        # (operation, object) tuples; save() looks up "fit"/"fit_nlsq"
        # entries here and export() appends an "export" entry.
        steps: list[tuple[str, Any]]
        # (operation, details...) tuples appended by each pipeline method.
        history: list[tuple[Any, ...]]
        # Last fitted model; the plotting methods require it to be set.
        _last_model: BaseModel | None
        # Presumably the result of the most recent fit() -- TODO confirm
        # against Pipeline.fit, which is not shown in this mixin module part.
        _last_fit_result: Any
        # Bayesian result required by plot_bayesian()/plot_diagnostics().
        _last_bayesian_result: Any
        # transform name -> (cached transform result, pre-transform data),
        # as unpacked by plot_transform().
        _transform_results: dict[str, tuple[Any, RheoData | None]]
        # Default transform name used by plot_transform() when none is given.
        _last_transform_name: str | None
        # Figure stored by the plot* methods and written by save_figure().
        _current_figure: Any
        # Return value of generate_diagnostic_suite(), set by plot_diagnostics().
        _diagnostic_results: Any
        # NOTE(review): not read or written by the mixins in this module;
        # meaning must be confirmed in Pipeline itself.
        _last_comparison: Any
        # Short pipeline identifier passed to every log call as pipeline_id.
        _id: str

        # Implemented by Pipeline; used by plot(include_prediction=True).
        def predict(
            self, model: BaseModel | None = ..., X: np.ndarray | None = ...
        ) -> RheoData: ...
        # Implemented by Pipeline; used by plot_fit() and plot_bayesian().
        def get_fit_result(self) -> Any: ...
        # Implemented by Pipeline; called by load() after data is assigned.
        def _apply_test_mode_metadata(
            self, data: RheoData | None, mode: str | None
        ) -> None: ...


class _PipelineIO(_PipelineState):
    """File I/O collaborator for Pipeline: load, save, export.

    Split out of the former monolithic Pipeline class (ASSESSMENT.md
    Technical Debt #5) to separate file-format concerns from the
    fit/predict/model-management core. Composed into Pipeline via multiple
    inheritance -- methods here operate on attributes owned by
    Pipeline.__init__ (self.data, self.steps, self.history, self._id).
    """

    def load(
        self,
        file_path: str | Path,
        format: str = "auto",
        *,
        test_mode: str | None = None,
        initial_test_mode: str | None = None,
        **kwargs,
    ) -> Self:
        """Load data from file and make it the pipeline's current data.

        Args:
            file_path: Path to data file
            format: File format ('auto', 'csv', 'excel', 'trios', 'hdf5',
                'npz'). 'auto' delegates to ``rheojax.io.auto_load``.
            test_mode: Optional rheological mode metadata to attach to the
                resulting RheoData (e.g., 'relaxation', 'creep', 'oscillation')
            initial_test_mode: Backwards-compatible alias for test_mode; only
                consulted when test_mode is None
            **kwargs: Additional arguments passed to reader. The 'npz' reader
                is called with the path only, so kwargs are not forwarded
                for that format.

        Returns:
            self for method chaining

        Raises:
            ValueError: If ``format`` is not recognized, or if the reader
                returned an empty list of segments.
            Exception: Any error raised by the underlying reader (e.g. a
                FileNotFoundError for a missing file) is logged and re-raised
                unchanged.

        Warns:
            UserWarning: If the reader returned more than one segment; only
                the first segment is kept.

        Example:
            >>> pipeline = Pipeline().load('data.csv', x_col='time', y_col='stress')
        """
        from rheojax.io import auto_load

        path = Path(file_path)

        explicit_mode = test_mode if test_mode is not None else initial_test_mode

        with log_pipeline_stage(
            logger, "load", pipeline_id=self._id, file_path=str(path), format=format
        ) as ctx:
            try:
                # Reader imports stay local so only the selected backend is
                # imported.
                if format == "auto":
                    result = auto_load(path, **kwargs)
                elif format == "csv":
                    from rheojax.io import load_csv

                    result = load_csv(path, **kwargs)
                elif format == "excel":
                    from rheojax.io import load_excel

                    result = load_excel(path, **kwargs)
                elif format == "trios":
                    from rheojax.io import load_trios

                    result = load_trios(path, **kwargs)
                elif format == "hdf5":
                    from rheojax.io import load_hdf5

                    result = load_hdf5(path, **kwargs)
                elif format == "npz":
                    from rheojax.io.writers.npz_writer import load_npz

                    # load_npz is called with the path only; reader kwargs
                    # are not forwarded for this format.
                    result = load_npz(path)
                else:
                    raise ValueError(f"Unknown format: {format}")

                # Handle multiple segments (for TRIOS)
                if isinstance(result, list):
                    if not result:
                        # Fail with a clear message instead of an opaque
                        # IndexError from result[0].
                        raise ValueError(f"No data segments found in {path}")
                    if len(result) > 1:
                        warnings.warn(
                            f"Loaded {len(result)} segments. Using first segment.",
                            stacklevel=2,
                        )
                    ctx["n_segments"] = len(result)
                    result = result[0]
                self.data = result

                self._apply_test_mode_metadata(self.data, explicit_mode)
                ctx["n_points"] = (
                    len(self.data.x)
                    if self.data is not None and self.data.x is not None
                    else 0
                )
                ctx["test_mode"] = explicit_mode

            except Exception as e:
                logger.error(
                    "Failed to load data",
                    pipeline_id=self._id,
                    file_path=str(path),
                    format=format,
                    error=str(e),
                    exc_info=True,
                )
                raise

        self.history.append(("load", str(path), format))
        return self

    def save(self, file_path: str | Path, format: str = "hdf5", **kwargs) -> Self:
        """Save current data to file.

        Args:
            file_path: Output file path
            format: Output format ('hdf5', 'excel', 'csv')
            **kwargs: Additional arguments passed to writer

        Returns:
            self for method chaining

        Example:
            >>> pipeline.save('output.hdf5')
        """
        if self.data is None:
            raise ValueError("No data to save. Call load() first.")

        path = Path(file_path)

        # R12-E-007: include fitted model parameters in data metadata before saving
        if self.steps:
            _last_fit_steps = [s for s in self.steps if s[0] in ("fit", "fit_nlsq")]
            if _last_fit_steps:
                _fit_model = _last_fit_steps[-1][1]
                if hasattr(_fit_model, "parameters"):
                    # See _apply_test_mode_metadata: RheoData.metadata is
                    # never None post-construction, so this is unreachable
                    # per the type checker; kept as defense-in-depth.
                    if self.data.metadata is None:
                        self.data.metadata = {}  # type: ignore[unreachable]
                    for _pname in _fit_model.parameters.keys():
                        try:
                            self.data.metadata[f"fitted_{_pname}"] = float(
                                _fit_model.parameters.get_value(_pname)
                            )
                        except (TypeError, ValueError):
                            pass
                    self.data.metadata["fitted_model"] = type(_fit_model).__name__

        with log_pipeline_stage(
            logger,
            "save",
            pipeline_id=self._id,
            file_path=str(path),
            format=format,
        ) as ctx:
            try:
                if format == "hdf5":
                    from rheojax.io import save_hdf5

                    save_hdf5(self.data, path, **kwargs)
                elif format == "excel":
                    from rheojax.io import save_excel

                    # R13-PIPE-XLS-001: The "parameters" key should contain
                    # actual model parameters (name → value), not metadata
                    # labels. Data metadata (units, domain) goes into a
                    # separate "fit_quality" dict for the Fit Quality sheet.
                    parameters: dict[str, Any] = {}
                    _last_fit_steps = [
                        s for s in self.steps if s[0] in ("fit", "fit_nlsq")
                    ]
                    if _last_fit_steps:
                        _fit_model = _last_fit_steps[-1][1]
                        if hasattr(_fit_model, "parameters"):
                            for _pname in _fit_model.parameters.keys():
                                try:
                                    parameters[_pname] = float(
                                        _fit_model.parameters.get_value(_pname)
                                    )
                                except (TypeError, ValueError):
                                    pass
                            parameters["model"] = type(_fit_model).__name__

                    fit_quality: dict[str, Any] = {}
                    if self.data.x_units:
                        fit_quality["x_units"] = self.data.x_units
                    if self.data.y_units:
                        fit_quality["y_units"] = self.data.y_units
                    if self.data.domain:
                        fit_quality["domain"] = self.data.domain

                    excel_payload: dict[str, Any] = {
                        "x": np.array(self.data.x),
                        "predictions": np.array(self.data.y),
                    }
                    if parameters:
                        excel_payload["parameters"] = parameters
                    if fit_quality:
                        excel_payload["fit_quality"] = fit_quality
                    save_excel(excel_payload, path, **kwargs)
                elif format == "csv":
                    # R13-PIPE-CSV-001: Handle complex and 2D y arrays
                    # in CSV export. Complex y is split into real/imag
                    # columns; 2D y is split into numbered columns.
                    import pandas as pd

                    x_arr = np.array(self.data.x)
                    y_arr = np.array(self.data.y)
                    if np.iscomplexobj(y_arr):
                        df = pd.DataFrame(
                            {
                                "x": x_arr,
                                "y_real": np.real(y_arr),
                                "y_imag": np.imag(y_arr),
                            }
                        )
                    elif y_arr.ndim == 2:
                        cols: dict[str, Any] = {"x": x_arr}
                        for ci in range(y_arr.shape[1]):
                            cols[f"y_{ci}"] = y_arr[:, ci]
                        df = pd.DataFrame(cols)
                    else:
                        df = pd.DataFrame({"x": x_arr, "y": y_arr})
                    df.to_csv(path, index=False, **kwargs)
                else:
                    raise ValueError(f"Unknown format: {format}")

                ctx["n_points"] = (
                    len(self.data.x) if self.data.x is not None else 0
                )
            except Exception as e:
                logger.error(
                    "Failed to save data",
                    pipeline_id=self._id,
                    file_path=str(path),
                    format=format,
                    error=str(e),
                    exc_info=True,
                )
                raise

        self.history.append(("save", str(path), format))
        return self

    def export(
        self,
        output: str | Path,
        format: str = "auto",
        *,
        include_data: bool = True,
        include_figures: bool = True,
        include_diagnostics: bool = True,
        figure_formats: tuple[str, ...] = ("pdf", "png"),
        figure_dpi: int = 300,
        **kwargs,
    ) -> Self:
        """Write the whole analysis out through ``AnalysisExporter``.

        The pipeline itself is handed to the exporter, which produces either
        a structured directory or a single Excel workbook.

        Args:
            output: Destination path. With ``format='auto'`` a ``.xlsx``
                suffix (case-insensitive) selects the Excel export; any
                other path is exported as a directory.
            format: Export format ('auto', 'directory', 'excel').
            include_data: Forwarded to the directory export.
            include_figures: Forwarded to the directory export; for the
                Excel export it is passed on as ``include_plots``.
            include_diagnostics: Forwarded to the directory export.
            figure_formats: Figure file formats given to the exporter
                (default: ('pdf', 'png')).
            figure_dpi: Figure resolution given to the exporter
                (default: 300).
            **kwargs: Additional arguments forwarded to the exporter method.

        Returns:
            self for method chaining

        Raises:
            ValueError: If the resolved format is neither 'directory' nor
                'excel'. Exporter errors are logged and re-raised.

        Example:
            >>> pipeline.load('data.csv').fit('maxwell').plot_fit().export('./results')
            >>> pipeline.export('report.xlsx')
        """
        from rheojax.io.analysis_exporter import AnalysisExporter

        target = Path(output)

        # Resolve 'auto' from the file suffix; explicit formats pass through.
        if format != "auto":
            export_format = format
        elif target.suffix.lower() == ".xlsx":
            export_format = "excel"
        else:
            export_format = "directory"

        exporter = AnalysisExporter(
            figure_formats=figure_formats,
            figure_dpi=figure_dpi,
        )

        with log_pipeline_stage(
            logger,
            "export",
            pipeline_id=self._id,
            output=str(target),
            format=export_format,
        ) as stage:
            try:
                if export_format == "directory":
                    exporter.export_directory(
                        self,
                        target,
                        include_data=include_data,
                        include_figures=include_figures,
                        include_diagnostics=include_diagnostics,
                        **kwargs,
                    )
                elif export_format == "excel":
                    exporter.export_excel(
                        self,
                        target,
                        include_plots=include_figures,
                        **kwargs,
                    )
                else:
                    raise ValueError(
                        f"Unknown export format: {export_format}. Use 'directory' or 'excel'."
                    )
                stage["format"] = export_format
            except Exception as exc:
                logger.error(
                    "Export failed",
                    pipeline_id=self._id,
                    output=str(target),
                    error=str(exc),
                    exc_info=True,
                )
                raise

        # Record the export in both the step list and the operation history.
        step_details = {"output_path": str(target), "format": export_format}
        self.steps.append(("export", step_details))
        self.history.append(("export", str(target), export_format))
        return self


class _PipelinePlotting(_PipelineState):
    """Plotting collaborator for Pipeline: plot, save_figure, plot_*.

    Split out of the former monolithic Pipeline class (ASSESSMENT.md
    Technical Debt #5). All methods here share self._current_figure and
    call back into Pipeline core methods (e.g. plot_fit calls
    self.get_fit_result()) -- safe because Pipeline composes this mixin,
    so `self` always has the full method/attribute set at runtime.
    """

    def plot(
        self,
        show: bool = True,
        style: str = "default",
        include_prediction: bool = False,
        **plot_kwargs,
    ) -> Self:
        """Plot current data state.

        Args:
            show: Whether to call plt.show()
            style: Plot style ('default', 'publication', 'presentation')
            include_prediction: If True and model fitted, overlay predictions
            **plot_kwargs: Additional arguments passed to plotting function

        Returns:
            self for method chaining

        Example:
            >>> pipeline.plot(style='publication')
        """
        if self.data is None:
            raise ValueError("No data loaded. Call load() first.")

        with log_pipeline_stage(
            logger,
            "plot",
            pipeline_id=self._id,
            style=style,
            include_prediction=include_prediction,
        ) as ctx:
            from rheojax.visualization.plotter import plot_rheo_data

            fig, ax = plot_rheo_data(self.data, style=style, **plot_kwargs)

            # Optionally overlay predictions
            if include_prediction and self._last_model is not None:
                predictions = self.predict()
                import matplotlib.pyplot as plt

                # Get the axes (handle both single and multiple axes)
                if isinstance(ax, np.ndarray):
                    ax_plot = ax[0]
                else:
                    ax_plot = ax

                ax_plot.plot(
                    predictions.x,
                    predictions.y,
                    "--",
                    label="Model Prediction",
                    linewidth=2,
                )
                ax_plot.legend()
                ctx["prediction_overlay"] = True

            if show:
                import matplotlib.pyplot as plt

                plt.show()

            # Store figure for save_figure() method
            self._current_figure = fig

        self.history.append(("plot", style))
        return self

    def save_figure(
        self,
        filepath: str | Path,
        format: str | None = None,
        dpi: int = 300,
        **kwargs: Any,
    ) -> Self:
        """
        Save the most recent plot to file.

        Convenience method for exporting plots with publication-quality defaults.
        Wraps rheojax.visualization.plotter.save_figure() to enable fluent API chaining.

        Parameters
        ----------
        filepath : str or Path
            Output file path. Format inferred from extension if not specified.
        format : str, optional
            Output format ('pdf', 'svg', 'png', 'eps'). If None, inferred from filepath.
        dpi : int, default=300
            Resolution for raster formats (PNG).
        **kwargs : dict
            Additional arguments passed to save_figure().
            See rheojax.visualization.plotter.save_figure() for details.

        Returns
        -------
        self : Pipeline
            Returns self to enable method chaining

        Raises
        ------
        ValueError
            If no plot exists (plot() not called yet)
        ValueError
            If format cannot be inferred or is unsupported
        OSError
            If filepath directory doesn't exist

        Examples
        --------
        Basic usage with method chaining:

        >>> pipeline = Pipeline()
        >>> pipeline.load('data.csv').fit('maxwell').plot().save_figure('result.pdf')

        Save multiple formats:

        >>> pipeline.plot(style='publication')
        >>> pipeline.save_figure('figure.pdf')
        >>> pipeline.save_figure('figure.png', dpi=600)
        >>> pipeline.save_figure('figure.svg', transparent=True)

        Explicit format:

        >>> pipeline.plot().save_figure('output', format='pdf')

        See Also
        --------
        plot : Generate plot with automatic type selection
        rheojax.visualization.plotter.save_figure : Core export function

        Notes
        -----
        This method saves the most recent plot generated by plot(). If you call plot()
        multiple times, only the last figure is saved. To save multiple plots, call
        save_figure() after each plot() call.

        The figure is stored internally by plot() and retrieved by save_figure().
        """
        if self._current_figure is None:
            raise ValueError(
                "No figure to save. Call plot() before save_figure(). "
                "Example: pipeline.load('data.csv').fit('maxwell').plot().save_figure('output.pdf')"
            )

        from rheojax.visualization.plotter import save_figure

        path = Path(filepath)

        with log_pipeline_stage(
            logger,
            "save_figure",
            pipeline_id=self._id,
            file_path=str(path),
            format=format,
            dpi=dpi,
        ) as ctx:
            try:
                save_figure(
                    self._current_figure, path, format=format, dpi=dpi, **kwargs
                )
                ctx["saved"] = True
            except Exception as e:
                logger.error(
                    "Failed to save figure",
                    pipeline_id=self._id,
                    file_path=str(path),
                    error=str(e),
                    exc_info=True,
                )
                raise

        self.history.append(("save_figure", str(path)))
        return self

    def plot_fit(
        self,
        confidence: float = 0.95,
        show_residuals: bool = True,
        show_uncertainty: bool = True,
        show: bool = True,
        style: str = "default",
        **kwargs,
    ) -> Self:
        """Plot NLSQ fit with uncertainty band and residuals.

        Requires a prior call to fit(). Uses FitPlotter internally.

        Args:
            confidence: Confidence level for uncertainty band (default: 0.95).
            show_residuals: If True, add residuals subplot.
            show_uncertainty: If True and covariance available, show band.
            show: Whether to call plt.show() (default: True).
            style: Plot style ('default', 'publication', 'presentation').
            **kwargs: Additional arguments forwarded to FitPlotter.plot_nlsq().

        Returns:
            self for method chaining

        Example:
            >>> pipeline.fit('maxwell').plot_fit(confidence=0.95)
        """
        if self._last_model is None:
            raise ValueError("No model fitted. Call fit() first.")
        if self.data is None:
            raise ValueError("No data loaded. Call load() first.")

        from rheojax.visualization.fit_plotter import FitPlotter

        fit_result = self.get_fit_result()
        plotter = FitPlotter()

        # np.asarray is zero-copy for CPU-backed arrays (JAX or numpy)
        X = np.asarray(self.data.x)
        y = np.asarray(self.data.y)

        _meta = getattr(self.data, "metadata", None) or {}
        if "test_mode" not in kwargs:
            tm = _meta.get("test_mode")
            if tm is not None:
                kwargs["test_mode"] = tm

        fig, axes = plotter.plot_nlsq(
            X,
            y,
            fit_result,
            self._last_model,
            confidence=confidence,
            show_residuals=show_residuals,
            show_uncertainty=show_uncertainty,
            style=style,
            **kwargs,
        )

        self._current_figure = fig

        if show:
            import matplotlib.pyplot as plt

            plt.show()

        self.history.append(("plot_fit", style))
        return self

    def plot_bayesian(
        self,
        credible_level: float = 0.95,
        max_draws: int = 500,
        show_nlsq_overlay: bool = False,
        show_residuals: bool = False,
        show: bool = True,
        style: str = "default",
        **kwargs,
    ) -> Self:
        """Plot Bayesian posterior predictive with credible interval.

        Requires a prior call to fit_bayesian().

        Args:
            credible_level: Credible interval level (default: 0.95).
            max_draws: Maximum posterior draws for band computation.
            show_nlsq_overlay: If True, overlay NLSQ fit for comparison.
            show_residuals: If True, add residuals subplot.
            show: Whether to call plt.show() (default: True).
            style: Plot style.
            **kwargs: Additional arguments forwarded to FitPlotter.plot_bayesian().

        Returns:
            self for method chaining

        Example:
            >>> pipeline.fit('maxwell').fit_bayesian(seed=42).plot_bayesian()
        """
        if self._last_bayesian_result is None:
            raise ValueError("No Bayesian result available. Call fit_bayesian() first.")
        if self._last_model is None:
            raise ValueError("No model available.")
        if self.data is None:
            raise ValueError("No data loaded.")

        from rheojax.visualization.fit_plotter import FitPlotter

        plotter = FitPlotter()
        # np.asarray is zero-copy for CPU-backed arrays (JAX or numpy)
        X = np.asarray(self.data.x)
        y = np.asarray(self.data.y)

        # Forward metadata
        _meta = getattr(self.data, "metadata", None) or {}
        if "test_mode" not in kwargs:
            tm = _meta.get("test_mode")
            if tm is not None:
                kwargs["test_mode"] = tm

        fit_result = None
        if show_nlsq_overlay:
            try:
                fit_result = self.get_fit_result()
            except ValueError:
                pass

        fig, axes = plotter.plot_bayesian(
            X,
            y,
            self._last_bayesian_result,
            self._last_model,
            credible_level=credible_level,
            max_draws=max_draws,
            show_nlsq_overlay=show_nlsq_overlay,
            fit_result=fit_result,
            show_residuals=show_residuals,
            style=style,
            **kwargs,
        )

        self._current_figure = fig

        if show:
            import matplotlib.pyplot as plt

            plt.show()

        self.history.append(("plot_bayesian", style))
        return self

    def plot_diagnostics(
        self,
        output_dir: str | Path | None = None,
        style: str = "default",
        prefix: str = "mcmc",
        formats: tuple[str, ...] = ("pdf", "png"),
        dpi: int = 300,
        **kwargs,
    ) -> Self:
        """Generate ArviZ MCMC diagnostic suite (6 plots).

        Requires a prior call to fit_bayesian().

        Args:
            output_dir: Directory for saving plots. If None, displays only.
            style: Plot style.
            prefix: Filename prefix for saved plots.
            formats: Output formats (default: ('pdf', 'png')).
            dpi: Resolution for raster formats.
            **kwargs: Additional arguments forwarded to generate_diagnostic_suite().

        Returns:
            self for method chaining

        Example:
            >>> pipeline.fit_bayesian(seed=42).plot_diagnostics(output_dir='./diag')
        """
        if self._last_bayesian_result is None:
            raise ValueError("No Bayesian result available. Call fit_bayesian() first.")

        from rheojax.visualization.fit_plotter import generate_diagnostic_suite

        result = generate_diagnostic_suite(
            self._last_bayesian_result,
            style=style,
            output_dir=output_dir,
            prefix=prefix,
            formats=formats,
            dpi=dpi,
            **kwargs,
        )

        self._diagnostic_results = result

        # Expose the first diagnostic figure for save_figure() chaining.
        # generate_diagnostic_suite returns Mapping[str, Figure | Path]
        # (a real dict at runtime).
        if isinstance(result, dict):
            for fig_or_path in result.values():
                if hasattr(fig_or_path, "savefig"):
                    self._current_figure = fig_or_path
                    break

        self.history.append(("plot_diagnostics", str(output_dir)))
        return self

    def plot_transform(
        self,
        transform_name: str | None = None,
        show_intermediate: bool = True,
        show: bool = True,
        style: str = "default",
        **kwargs,
    ) -> Self:
        """Plot a cached transform result with TransformPlotter.

        The result is looked up in the pipeline's transform cache; the
        figure is stored on the pipeline for a later save_figure() call.

        Args:
            transform_name: Name of the transform to plot. If None, the
                most recently applied transform is used.
            show_intermediate: Forwarded to the plotter. When True the
                cached pre-transform data is passed as ``input_data``;
                otherwise ``input_data`` is None.
            show: Whether to call plt.show() (default: True).
            style: Plot style.
            **kwargs: Additional arguments forwarded to TransformPlotter.

        Returns:
            self for method chaining

        Raises:
            ValueError: If no name can be resolved or nothing is cached
                under it.

        Example:
            >>> pipeline.transform('mastercurve', reference_temp=25.0).plot_transform()
        """
        from rheojax.visualization.transform_plotter import TransformPlotter

        # Fall back to the last applied transform when no name is given.
        name = (
            transform_name
            if transform_name is not None
            else self._last_transform_name
        )

        cache = self._transform_results
        if name is None or name not in cache:
            available = list(cache.keys())
            raise ValueError(
                f"No cached result for transform '{name}'. "
                f"Available transforms: {available}. "
                "Call transform() before plot_transform()."
            )

        cached_result, pre_data = cache[name]
        input_data = pre_data if show_intermediate else None

        fig, _axes = TransformPlotter().plot(
            name,
            cached_result,
            input_data=input_data,
            show_intermediate=show_intermediate,
            style=style,
            **kwargs,
        )

        self._current_figure = fig

        if show:
            import matplotlib.pyplot as plt

            plt.show()

        self.history.append(("plot_transform", name, style))
        return self



# [docs]
class Pipeline(_PipelineIO, _PipelinePlotting):
    """Fluent API for rheological analysis workflows.

    This class provides a chainable interface for loading data, applying
    transforms, fitting models, and generating outputs. Workflow methods
    (transform, fit, fit_bayesian, compare_models, reset, ...) return the
    pipeline itself so calls can be chained; accessor methods (predict,
    get_result, get_history, get_last_model, ...) return the requested value.

    Attributes:
        data: Current RheoData state (None until data is supplied)
        steps: List of (operation, object) tuples. Entries are appended by
            transform ("transform"), fit ("fit"), fit_bayesian
            ("fit_bayesian") and compare_models ("compare_models").
        history: List of tuples tracking all operations; the first element
            is the operation name, the rest are operation-specific details
        _last_model: Last fitted model for convenience

    Example:
        >>> pipeline = Pipeline()
        >>> pipeline.load('data.csv').fit('maxwell').plot()
    """


[docs]
    def __init__(self, data: RheoData | None = None):
        """Create a pipeline, optionally seeded with data.

        Args:
            data: RheoData to start from. When omitted, ``self.data`` starts
                as None and data must be supplied before transform()/fit().
        """
        # Short identifier attached to this pipeline's log records.
        self._id = str(uuid.uuid4())[:8]

        # Working data plus the two audit trails.
        self.data = data
        self.steps: list[tuple[str, Any]] = []
        self.history: list[tuple[Any, ...]] = []

        # Transform cache: name -> (transform result, data before transform).
        self._transform_results: dict[str, tuple[Any, RheoData | None]] = {}
        self._last_transform_name: str | None = None

        # Results of the most recent fitting / comparison operations.
        self._last_model: BaseModel | None = None
        self._last_fit_result: Any = None
        self._last_bayesian_result: Any = None
        self._last_comparison: Any = None

        # Slots for diagnostics and the most recently produced figure.
        self._diagnostic_results: Any = None
        self._current_figure: Any = None

        logger.debug(
            "Pipeline initialized",
            pipeline_id=self._id,
            has_initial_data=data is not None,
        )



[docs]
    def transform(self, transform: str | BaseTransform, **kwargs) -> Pipeline:
        """Apply a transform to the current data and make its output current.

        Args:
            transform: Transform name (string) or Transform instance
            **kwargs: Forwarded to ``TransformRegistry.create()`` when
                ``transform`` is a string; not used when an instance is given.

        Returns:
            self for method chaining

        Raises:
            ValueError: If no data is loaded or the loaded data has no x
                values. Errors raised by ``TransformRegistry.create()`` or by
                the transform itself propagate unchanged.

        Example:
            >>> pipeline.transform('smooth', window_size=5)
            >>> # or with instance
            >>> from rheojax.transforms import SmoothTransform
            >>> pipeline.transform(SmoothTransform(window_size=5))
        """
        if self.data is None:
            raise ValueError("No data loaded. Call load() first.")

        # Instances are used as-is and named after their class; strings are
        # resolved through the registry and keep the requested name.
        if not isinstance(transform, str):
            transform_obj = transform
            transform_name = transform_obj.__class__.__name__
        else:
            transform_obj = TransformRegistry.create(transform, **kwargs)
            transform_name = transform

        logger.debug(
            "Creating transform",
            pipeline_id=self._id,
            transform=transform_name,
        )

        with log_pipeline_stage(
            logger, "transform", pipeline_id=self._id, transform=transform_name
        ) as ctx:
            try:
                # The transform receives the whole RheoData object, not just
                # the raw y array.
                source = self.data
                if source.x is None:
                    raise ValueError("Loaded data has no x values.")
                ctx["input_shape"] = len(source.x)
                outcome = transform_obj.transform(source)

                # Remember the full result and its input for plot_transform().
                self._transform_results[transform_name] = (outcome, source)
                self._last_transform_name = transform_name

                # A tuple result carries the new data in its first slot.
                self.data = outcome[0] if isinstance(outcome, tuple) else outcome
            except Exception as exc:
                logger.error(
                    "Transform failed",
                    pipeline_id=self._id,
                    transform=transform_name,
                    error=str(exc),
                    exc_info=True,
                )
                raise

        # R12-E-002: record the transform object in steps so batch replay can
        # re-apply it; history keeps only the name.
        self.steps.append(("transform", transform_obj))
        self.history.append(("transform", transform_name))
        return self


    def _apply_test_mode_metadata(
        self, data: RheoData | None, mode: str | None
    ) -> None:
        """Attach explicit test mode information to loaded data."""

        if data is None or mode is None:
            return

        # RheoData.metadata is typed as always-a-dict (never None) and
        # __post_init__ normalizes any None passed at construction, so this
        # is genuinely unreachable per the type checker. Kept as
        # defense-in-depth in case that invariant is ever relaxed.
        if data.metadata is None:
            data.metadata = {}  # type: ignore[unreachable]

        data.metadata["test_mode"] = mode
        data.metadata.setdefault("detected_test_mode", mode)

        # Persist explicit annotation for downstream helpers that rely on it
        if hasattr(data, "_explicit_test_mode"):
            data._explicit_test_mode = mode


[docs]
    def fit(
        self,
        model: str | BaseModel,
        method: str = "auto",
        **fit_kwargs,
    ) -> Pipeline:
        """Fit a model to the current data.

        Args:
            model: Model name (string) or Model instance. An instance is
                fitted in place.
            method: Optimization method passed to model.fit() ('nlsq', 'scipy', 'auto').
                Default 'auto' lets the model choose.
            **fit_kwargs: Additional arguments forwarded to ``model.fit()``.
                When ``test_mode`` is not given, it is taken from
                ``self.data.metadata['test_mode']`` if that value is not None.

        Returns:
            self for method chaining

        Raises:
            ValueError: If no data is loaded or the loaded data has no x
                values. Errors raised by ``ModelRegistry.create()`` or
                ``model.fit()`` propagate unchanged (fit errors are logged
                first).

        Example:
            >>> pipeline.fit('maxwell')
            >>> # or with instance
            >>> from rheojax.models.linear import Maxwell
            >>> pipeline.fit(Maxwell())
        """
        if self.data is None:
            raise ValueError("No data loaded. Call load() first.")

        reject_removed_options(fit_kwargs)

        # Create model if string
        if isinstance(model, str):
            model_obj = ModelRegistry.create(model)
            model_name = model
        else:
            model_obj = model
            model_name = model_obj.__class__.__name__

        logger.debug(
            "Creating model for fitting",
            pipeline_id=self._id,
            model=model_name,
        )

        X = self.data.x
        y = self.data.y

        # Fail with the same clear error transform() uses instead of an
        # AttributeError on ``X.shape`` below.
        if X is None:
            raise ValueError("Loaded data has no x values.")

        # Convert to numpy for fitting
        if _is_jax_array(X):
            X = np.asarray(X)
        if _is_jax_array(y):
            y = np.asarray(y)

        with log_pipeline_stage(
            logger,
            "fit",
            pipeline_id=self._id,
            model=model_name,
            data_shape=X.shape,
        ) as ctx:
            try:
                # PB-001: auto-propagate test_mode from loaded data metadata.
                # An explicit test_mode kwarg always wins; a None metadata
                # value is not forwarded.
                if "test_mode" not in fit_kwargs:
                    _meta = getattr(self.data, "metadata", None) or {}
                    _tm = _meta.get("test_mode")
                    if _tm is not None:
                        fit_kwargs["test_mode"] = _tm
                # R12-E-001: forward method kwarg to model.fit()
                fit_kwargs["method"] = method
                model_obj.fit(X, y, **fit_kwargs)
                self._last_model = model_obj
                # Drop any FitResult kept from an earlier fit.
                self._last_fit_result = None
                self.steps.append(("fit", model_obj))
                try:
                    score = model_obj.score(X, y)
                except Exception as score_exc:
                    # Scoring is best-effort: a failure must not undo a
                    # successful fit, but it should leave a trace in the log.
                    logger.debug(
                        "Model scoring failed; recording NaN score",
                        pipeline_id=self._id,
                        model=model_name,
                        error=str(score_exc),
                    )
                    score = float("nan")
                ctx["r_squared"] = score
                self.history.append(("fit", model_name, score))
            except Exception as e:
                logger.error(
                    "Model fitting failed",
                    pipeline_id=self._id,
                    model=model_name,
                    error=str(e),
                    exc_info=True,
                )
                raise

        return self



[docs]
    def predict(
        self, model: BaseModel | None = None, X: np.ndarray | None = None
    ) -> RheoData:
        """Generate predictions from fitted model.

        Args:
            model: Model to use for prediction. If None, uses last fitted model.
            X: Input data for prediction. If None, uses current data.x.

        Returns:
            RheoData with ``x=X`` and ``y=model.predict(X)``. Units, domain
            and metadata are inherited from the current data when there is
            any (domain falls back to "time" otherwise); metadata additionally
            gets ``"type": "prediction"`` and ``"model": <class name>``.

        Raises:
            ValueError: If no model is available, or if ``X`` is None and the
                pipeline has no data (or the data has no x values).

        Example:
            >>> predictions = pipeline.predict()
        """
        if model is None:
            model = self._last_model

        if model is None:
            raise ValueError("No model fitted. Call fit() first.")

        data = self.data

        if X is None:
            if data is None or data.x is None:
                raise ValueError("No data available for prediction.")
            X = np.asarray(data.x)
        elif _is_jax_array(X):
            # Caller-supplied JAX array: hand the model a numpy array.
            X = np.asarray(X)

        logger.debug(
            "Generating predictions",
            pipeline_id=self._id,
            model=model.__class__.__name__,
            n_points=len(X),
        )

        predictions = model.predict(X)

        # Presence is tested with ``is not None`` rather than truthiness so a
        # data object that happens to evaluate as falsy still contributes its
        # units, domain and metadata.
        inherited_meta = (
            data.metadata if data is not None and data.metadata is not None else {}
        )

        return RheoData(
            x=X,
            y=predictions,
            x_units=data.x_units if data is not None else None,
            y_units=data.y_units if data is not None else None,
            domain=data.domain if data is not None else "time",
            metadata={
                **inherited_meta,
                "type": "prediction",
                "model": model.__class__.__name__,
            },
            validate=False,
        )



[docs]
    def fit_bayesian(
        self,
        model: str | BaseModel | None = None,
        seed: int | None = None,
        **bayesian_kwargs,
    ) -> Pipeline:
        """Run Bayesian (NUTS) inference on current data.

        Delegates to ``model.fit_bayesian()`` using the given model or, when
        none is given, the last fitted one.

        Args:
            model: Model name, instance, or None to reuse last fitted model.
            seed: Random seed for reproducibility. Forwarded only when not
                None; otherwise the model's own default applies.
            **bayesian_kwargs: Arguments forwarded to model.fit_bayesian()
                (num_warmup, num_samples, num_chains, target_accept_prob, etc.).
                Two keys get special treatment here:

                * ``warm_start`` (default True) is consumed by the pipeline
                  and never forwarded. When False and ``initial_values`` is
                  not supplied, initial values are built from each
                  parameter's bounds: geometric mean for strictly positive
                  finite bounds, arithmetic mean for other finite bounds,
                  and 1.0 when a bound is missing or infinite.
                * ``test_mode``, when absent, is taken from
                  ``self.data.metadata['test_mode']`` if that is not None.

        Returns:
            self for method chaining

        Raises:
            ValueError: If no data is loaded, or no model is given and none
                has been fitted. Errors from ``model.fit_bayesian()`` are
                logged and re-raised.

        Example:
            >>> pipeline.fit('maxwell').fit_bayesian(seed=42, num_warmup=1000)
        """
        if self.data is None:
            raise ValueError("No data loaded. Call load() first.")

        reject_removed_options(bayesian_kwargs)

        # Resolve model
        if model is not None:
            if isinstance(model, str):
                model_obj = ModelRegistry.create(model)
            else:
                model_obj = model
        elif self._last_model is not None:
            model_obj = self._last_model
        else:
            raise ValueError("No model available. Call fit() first or provide a model.")

        X = self.data.x
        y = self.data.y
        if _is_jax_array(X):
            X = np.asarray(X)
        if _is_jax_array(y):
            y = np.asarray(y)

        # PIPE-WARM-001: strip pipeline-level `warm_start` kwarg — it must not
        # be forwarded to model.fit_bayesian(), which passes **nuts_kwargs
        # straight to NUTS().  Passing warm_start=True to NUTS causes TypeError.
        #
        # When warm_start is False, build initial_values from bounds midpoints
        # so the sampler starts from prior-like values rather than NLSQ estimates.
        # We avoid model_obj.__class__() which crashes for models with required
        # constructor args (GeneralizedMaxwell(n_modes), STZ(variant), etc.).
        use_warm_start = bayesian_kwargs.pop("warm_start", True)
        if not use_warm_start:
            import math  # noqa: PLC0415

            midpoint_values: dict[str, float] = {}
            for name in model_obj.parameters.keys():
                bounds = model_obj.parameters[name].bounds
                lo, hi = bounds if bounds is not None else (None, None)
                # An infinite bound is as uninformative as a missing one:
                # a midpoint computed from it would be inf or nan, which is
                # not a usable starting value.
                has_finite_bounds = (
                    lo is not None
                    and hi is not None
                    and math.isfinite(lo)
                    and math.isfinite(hi)
                )
                if not has_finite_bounds:
                    midpoint_values[name] = 1.0
                elif lo > 0 and hi > 0:
                    # Strictly positive range: midpoint on a log scale.
                    midpoint_values[name] = math.sqrt(lo * hi)
                else:
                    midpoint_values[name] = (lo + hi) / 2.0
            # Caller-provided initial_values take precedence.
            bayesian_kwargs.setdefault("initial_values", midpoint_values)

        # Auto-propagate metadata
        # Use explicit `is not None` guards — truthy check swallows falsy-but-valid
        # values such as test_mode="" (empty string).
        _meta = getattr(self.data, "metadata", None) or {}
        if "test_mode" not in bayesian_kwargs:
            _tm = _meta.get("test_mode")
            if _tm is not None:
                bayesian_kwargs["test_mode"] = _tm

        if seed is not None:
            bayesian_kwargs["seed"] = seed

        with log_pipeline_stage(
            logger,
            "fit_bayesian",
            pipeline_id=self._id,
            model=model_obj.__class__.__name__,
        ) as ctx:
            try:
                result = model_obj.fit_bayesian(X, y, **bayesian_kwargs)
                self._last_bayesian_result = result
                self._last_model = model_obj
                # Store sampling kwargs on the model so BatchPipeline can
                # replay with the same configuration.  _last_fit_kwargs only
                # contains protocol kwargs from NLSQ — Bayesian sampling
                # params (num_warmup, num_samples, num_chains, seed) are
                # consumed by NumPyro and never stored there.
                _sampling_keys = {
                    "num_warmup",
                    "num_samples",
                    "num_chains",
                    "seed",
                    "target_accept_prob",
                }
                model_obj._last_bayesian_kwargs = {
                    k: v for k, v in bayesian_kwargs.items() if k in _sampling_keys
                }
                self.steps.append(("fit_bayesian", model_obj))
                self.history.append(("fit_bayesian", model_obj.__class__.__name__))
                ctx["num_samples"] = getattr(result, "num_samples", None)
                ctx["num_chains"] = getattr(result, "num_chains", None)
            except Exception as e:
                logger.error(
                    "Bayesian fitting failed",
                    pipeline_id=self._id,
                    error=str(e),
                    exc_info=True,
                )
                raise

        return self



[docs]
    def get_result(self) -> RheoData:
        """Get current data state.

        Returns:
            Current RheoData

        Example:
            >>> data = pipeline.get_result()
        """
        if self.data is None:
            raise ValueError("No data available. Call load() first.")
        return self.data



[docs]
    def get_history(self) -> list[tuple[Any, ...]]:
        """Get pipeline execution history.

        Returns:
            List of (operation, details) tuples

        Example:
            >>> history = pipeline.get_history()
            >>> for step in history:
            ...     print(step)
        """
        return self.history.copy()



[docs]
    def get_last_model(self) -> BaseModel | None:
        """Get the last fitted model.

        Returns:
            Last fitted BaseModel or None

        Example:
            >>> model = pipeline.get_last_model()
            >>> params = model.get_params()
        """
        return self._last_model



[docs]
    def get_all_models(self) -> list[BaseModel]:
        """Get all fitted models from pipeline.

        Returns:
            List of all fitted models

        Example:
            >>> models = pipeline.get_all_models()
        """
        return [step[1] for step in self.steps if step[0] in ("fit", "fit_nlsq")]



[docs]
    def get_fitted_parameters(self) -> dict[str, float]:
        """Get fitted parameters from the last model as a dictionary.

        This is a convenience method that extracts parameter values from
        the last fitted model's ParameterSet.

        Returns:
            Dictionary mapping parameter names to their fitted values

        Raises:
            ValueError: If no model has been fitted yet

        Example:
            >>> pipeline = Pipeline()
            >>> pipeline.load('data.csv').fit('maxwell')
            >>> params = pipeline.get_fitted_parameters()
            >>> print(params)  # {'G0': 100000.0, 'eta': 1000.0}
            >>> G0 = params['G0']
        """
        if self._last_model is None:
            raise ValueError("No model fitted. Call fit() first.")

        # Extract all parameter values from the model's ParameterSet.
        # get_value() can return None for a declared-but-unset parameter;
        # surface that as an error rather than silently returning a dict
        # with a None value where a float is promised.
        result: dict[str, float] = {}
        for name in self._last_model.parameters.keys():
            value = self._last_model.parameters.get_value(name)
            if value is None:
                raise ValueError(
                    f"Parameter '{name}' has no fitted value. "
                    "Call fit() before get_fitted_parameters()."
                )
            result[name] = value
        return result



[docs]
    def compare_models(
        self,
        models: list[str | BaseModel],
        criterion: str = "aic",
        **fit_kwargs,
    ) -> Pipeline:
        """Compare several candidate models on the current data.

        The comparison itself is delegated to
        ``rheojax.utils.model_selection.compare_models``. Its result is kept
        in ``_last_comparison`` and recorded in ``history``. When the best
        result carries a fitted model instance, that instance becomes
        ``_last_model`` and is appended to ``steps``; otherwise a warning is
        logged and ``_last_model`` is left unchanged.

        Args:
            models: List of model names (strings) or BaseModel instances.
            criterion: Ranking criterion ('aic', 'aicc', 'bic').
            **fit_kwargs: Extra kwargs forwarded to the comparison helper.
                ``test_mode`` is passed as its own argument and, when absent,
                is taken from the data's metadata.

        Returns:
            self for method chaining

        Raises:
            ValueError: If no data is loaded.

        Example:
            >>> pipeline.load('data.csv').compare_models(['maxwell', 'zener'])
        """
        if self.data is None:
            raise ValueError("No data loaded. Call load() first.")

        reject_removed_options(fit_kwargs)

        from rheojax.utils.model_selection import compare_models as _compare

        X, y = self.data.x, self.data.y
        if _is_jax_array(X):
            X = np.asarray(X)
        if _is_jax_array(y):
            y = np.asarray(y)

        # An explicit test_mode kwarg (even None or "") wins over metadata;
        # without one, the metadata value (possibly None) is used.
        metadata = getattr(self.data, "metadata", None) or {}
        if "test_mode" in fit_kwargs:
            test_mode = fit_kwargs.pop("test_mode")
        else:
            test_mode = metadata.get("test_mode")

        comparison = _compare(
            X,
            y,
            models=models,
            test_mode=test_mode,
            criterion=criterion,
            **fit_kwargs,
        )

        self._last_comparison = comparison
        self.history.append(("compare_models", comparison.best_model, criterion))

        if not comparison.results:
            return self

        # Locate the first result belonging to the winning model so its
        # already-fitted instance can be reused instead of fitting again.
        best_fr = None
        for candidate in comparison.results:
            if candidate.model_name == comparison.best_model:
                best_fr = candidate
                break

        fitted_model = getattr(best_fr, "_fitted_model", None) if best_fr else None
        if fitted_model is None:
            logger.warning(
                "Best model FitResult has no attached fitted model",
                model=comparison.best_model,
            )
        else:
            self._last_model = fitted_model
            self.steps.append(("compare_models", fitted_model))

        return self



[docs]
    def get_fit_result(self) -> Any:
        """Build a FitResult for the last fitted model.

        The result is produced by
        ``rheojax.utils.model_selection.build_fit_result`` from the last
        model, the current data's x/y (None when there is no data) and the
        ``test_mode`` found in the data's metadata, if any. It is rebuilt on
        every call.

        Returns:
            Whatever ``build_fit_result`` returns for the last model.

        Raises:
            ValueError: If no model has been fitted.

        Example:
            >>> result = pipeline.load('data.csv').fit('maxwell').get_fit_result()
            >>> print(result.summary())
        """
        model = self._last_model
        if model is None:
            raise ValueError("No model fitted. Call fit() first.")

        from rheojax.utils.model_selection import build_fit_result

        data = self.data
        if data is None:
            X = y = test_mode = None
        else:
            X, y = data.x, data.y
            metadata = getattr(data, "metadata", None) or {}
            test_mode = metadata.get("test_mode")

        return build_fit_result(model, X, y, test_mode=test_mode)



[docs]
    def clone(self) -> Pipeline:
        """Create a copy of the pipeline.

        The clone gets a copy of the data (``data.copy()``), a deep copy of
        ``steps``, a shallow copy of ``history`` and a deep copy of
        ``_last_model``. ``steps`` and ``_last_model`` are deep-copied with a
        shared memo, so a model that appears in both places is still one
        single object in the clone, as it is in the original. All other
        state (transform cache, Bayesian result, comparison, diagnostics,
        figure) starts fresh, and the clone receives its own pipeline id.

        Returns:
            New Pipeline with copied data, steps, history and last model

        Example:
            >>> pipeline2 = pipeline.clone()
        """
        # ``is not None`` rather than truthiness: a data object that happens
        # to evaluate as falsy must still be copied.
        new_pipeline = Pipeline(
            data=self.data.copy() if self.data is not None else None
        )

        # Sharing one memo across both deepcopy calls preserves object
        # identity between the copied steps and the copied last model.
        memo: dict[int, Any] = {}
        new_pipeline.steps = copy.deepcopy(self.steps, memo)
        new_pipeline.history = self.history.copy()
        new_pipeline._last_model = copy.deepcopy(self._last_model, memo)

        logger.debug(
            "Pipeline cloned",
            original_id=self._id,
            new_id=new_pipeline._id,
        )
        return new_pipeline



[docs]
    def reset(self) -> Pipeline:
        """Clear all accumulated state, keeping only the pipeline id.

        Data, steps, history, cached transform results and every stored
        fit / comparison / diagnostic / figure reference are discarded.

        Returns:
            self for method chaining

        Example:
            >>> pipeline.reset()
        """
        logger.debug("Pipeline reset", pipeline_id=self._id)

        # Containers are replaced with fresh empty ones.
        self.steps, self.history = [], []
        self._transform_results = {}

        # Everything else goes back to "nothing yet".
        self.data = None
        self._last_transform_name = None
        self._last_model = self._last_fit_result = None
        self._last_bayesian_result = self._last_comparison = None
        self._diagnostic_results = self._current_figure = None

        return self



[docs]
    def __repr__(self) -> str:
        """String representation of pipeline."""
        n_steps = len(self.history)
        has_data = self.data is not None
        has_model = self._last_model is not None
        return f"Pipeline(steps={n_steps}, has_data={has_data}, has_model={has_model})"




__all__ = ["Pipeline"]