Source code for pytcl.static_estimation.maximum_likelihood

"""
Maximum Likelihood Estimation and Information Theory.

This module provides tools for maximum likelihood estimation, Fisher
information computation, and Cramer-Rao bound analysis.

References
----------
.. [1] S. M. Kay, "Fundamentals of Statistical Signal Processing:
       Estimation Theory," Prentice Hall, 1993.
.. [2] H. L. Van Trees, "Detection, Estimation, and Modulation Theory,"
       Wiley, 2001.
"""

from typing import Any, Callable, NamedTuple, Optional

import numpy as np
from numpy.typing import ArrayLike, NDArray



[docs]
class MLResult(NamedTuple):
    """Result of maximum likelihood estimation.

    Attributes
    ----------
    theta : ndarray
        Estimated parameters.
    log_likelihood : float
        Log-likelihood at the estimate.
    fisher_info : ndarray
        Fisher information matrix at the estimate.
    covariance : ndarray
        Estimated covariance (inverse Fisher information).
    n_iter : int
        Number of iterations (for iterative methods).
    converged : bool
        Whether the optimization converged.
    """

    theta: NDArray[np.floating]
    log_likelihood: float
    fisher_info: NDArray[np.floating]
    covariance: NDArray[np.floating]
    n_iter: int
    converged: bool




[docs]
class CRBResult(NamedTuple):
    """Result of Cramer-Rao bound computation.

    Attributes
    ----------
    crb_matrix : ndarray
        Cramer-Rao bound matrix (inverse Fisher information).
    variances : ndarray
        Diagonal elements (variance bounds for each parameter).
    std_bounds : ndarray
        Square root of variances (standard deviation bounds).
    fisher_info : ndarray
        Fisher information matrix used.
    is_efficient : ndarray or None
        Boolean mask indicating which estimators achieve the bound.
    """

    crb_matrix: NDArray[np.floating]
    variances: NDArray[np.floating]
    std_bounds: NDArray[np.floating]
    fisher_info: NDArray[np.floating]
    is_efficient: Optional[NDArray[np.bool_]]



# =============================================================================
# Fisher Information
# =============================================================================



[docs]
def fisher_information_numerical(
    log_likelihood: Callable[[np.ndarray[Any, Any]], float],
    theta: ArrayLike,
    h: float = 1e-5,
) -> np.ndarray[Any, Any]:
    """
    Compute Fisher information matrix numerically.

    Uses the negative expected Hessian of the log-likelihood.

    Parameters
    ----------
    log_likelihood : callable
        Log-likelihood function L(theta).
    theta : array_like
        Parameter vector at which to evaluate.
    h : float, optional
        Step size for numerical differentiation. Default 1e-5.

    Returns
    -------
    fisher : ndarray
        Fisher information matrix of shape (n_params, n_params).

    Examples
    --------
    >>> def log_lik(theta):
    ...     return -0.5 * np.sum((data - theta[0])**2 / theta[1])
    >>> theta = np.array([0.0, 1.0])
    >>> F = fisher_information_numerical(log_lik, theta)

    Notes
    -----
    For a single observation, the Fisher information is:
        I(theta) = -E[d^2 log p(x|theta) / d theta^2]

    This function approximates the Hessian using central differences.
    """
    theta = np.asarray(theta, dtype=np.float64)
    n = len(theta)
    fisher = np.zeros((n, n))

    # Compute Hessian using central differences
    for i in range(n):
        for j in range(i, n):
            # Four-point formula for mixed partials
            theta_pp = theta.copy()
            theta_pm = theta.copy()
            theta_mp = theta.copy()
            theta_mm = theta.copy()

            theta_pp[i] += h
            theta_pp[j] += h
            theta_pm[i] += h
            theta_pm[j] -= h
            theta_mp[i] -= h
            theta_mp[j] += h
            theta_mm[i] -= h
            theta_mm[j] -= h

            hess_ij = (
                log_likelihood(theta_pp)
                - log_likelihood(theta_pm)
                - log_likelihood(theta_mp)
                + log_likelihood(theta_mm)
            ) / (4 * h * h)

            # Fisher info is negative expected Hessian
            fisher[i, j] = -hess_ij
            fisher[j, i] = -hess_ij

    return fisher




[docs]
def fisher_information_gaussian(
    jacobian: ArrayLike,
    noise_cov: ArrayLike,
) -> NDArray[np.floating]:
    """
    Fisher information for Gaussian measurement model.

    For the model y = h(theta) + noise, where noise ~ N(0, R),
    the Fisher information is J^T R^{-1} J.

    Parameters
    ----------
    jacobian : array_like
        Jacobian matrix dh/dtheta of shape (m, n).
    noise_cov : array_like
        Measurement noise covariance R of shape (m, m).

    Returns
    -------
    fisher : ndarray
        Fisher information matrix of shape (n, n).

    Examples
    --------
    >>> H = np.array([[1, 0], [0, 1], [1, 1]])  # 3 measurements, 2 params
    >>> R = np.eye(3) * 0.1
    >>> F = fisher_information_gaussian(H, R)

    Notes
    -----
    This is the standard result for linear Gaussian models and
    provides the information content of measurements about parameters.
    """
    J = np.asarray(jacobian, dtype=np.float64)
    R = np.asarray(noise_cov, dtype=np.float64)

    R_inv = np.linalg.inv(R)
    return J.T @ R_inv @ J




[docs]
def fisher_information_exponential_family(
    sufficient_stats: Callable[
        [np.ndarray[Any, Any], np.ndarray[Any, Any]], np.ndarray[Any, Any]
    ],
    theta: ArrayLike,
    data: ArrayLike,
    h: float = 1e-5,
) -> np.ndarray[Any, Any]:
    """
    Fisher information for exponential family distributions.

    For exponential family: p(x|theta) = h(x) exp(eta(theta)^T T(x) - A(theta))
    The Fisher information equals the covariance of sufficient statistics.

    Parameters
    ----------
    sufficient_stats : callable
        Function T(x, theta) returning sufficient statistics.
    theta : array_like
        Natural parameters.
    data : array_like
        Observed data of shape (n_samples, ...).
    h : float, optional
        Step size for numerical differentiation.

    Returns
    -------
    fisher : ndarray
        Fisher information matrix.

    Notes
    -----
    For exponential families, I(theta) = Var[T(X)] = d^2 A(theta) / d theta^2,
    where A(theta) is the log-partition function.

    Examples
    --------
    >>> def suff_stats(x, theta):
    ...     return np.array([x, x**2])  # Mean and second moment
    >>> data = np.random.normal(0, 1, 100)
    >>> theta = np.array([0.0, 1.0])
    >>> F = fisher_information_exponential_family(suff_stats, theta, data)
    """
    theta = np.asarray(theta, dtype=np.float64)
    data = np.asarray(data, dtype=np.float64)

    # Compute sufficient statistics for all data
    T = np.array([sufficient_stats(x, theta) for x in data])

    # Fisher info is covariance of sufficient statistics
    return np.cov(T.T)




[docs]
def observed_fisher_information(
    log_likelihood: Callable[[np.ndarray[Any, Any]], float],
    theta: ArrayLike,
    h: float = 1e-5,
) -> np.ndarray[Any, Any]:
    """
    Compute observed Fisher information (negative Hessian).

    Unlike the expected Fisher information, this uses the actual
    observed data rather than the expectation.

    Parameters
    ----------
    log_likelihood : callable
        Log-likelihood function.
    theta : array_like
        Parameter estimate.
    h : float, optional
        Step size for numerical differentiation.

    Returns
    -------
    observed_fisher : ndarray
        Observed Fisher information matrix.

    Notes
    -----
    The observed Fisher information is often more accurate for
    finite samples and is asymptotically equivalent to the
    expected Fisher information.

    Examples
    --------
    >>> data = np.array([1.2, 0.8, 1.1, 0.9, 1.0])
    >>> def log_lik(theta):
    ...     return -0.5 * np.sum((data - theta[0])**2 / theta[1]) - 2.5 * np.log(theta[1])
    >>> theta = np.array([1.0, 0.1])
    >>> F_obs = observed_fisher_information(log_lik, theta)
    """
    return fisher_information_numerical(log_likelihood, theta, h)



# =============================================================================
# Cramer-Rao Bound
# =============================================================================



[docs]
def cramer_rao_bound(
    fisher_info: ArrayLike,
    estimator_variance: Optional[ArrayLike] = None,
) -> CRBResult:
    """
    Compute Cramer-Rao lower bound on estimator variance.

    Parameters
    ----------
    fisher_info : array_like
        Fisher information matrix of shape (n, n).
    estimator_variance : array_like, optional
        Actual estimator variance for efficiency check.

    Returns
    -------
    result : CRBResult
        Named tuple with CRB matrix, variances, and efficiency info.

    Examples
    --------
    >>> F = np.array([[10, 0], [0, 5]])  # Fisher info
    >>> result = cramer_rao_bound(F)
    >>> result.variances  # Minimum achievable variances
    array([0.1, 0.2])

    Notes
    -----
    The Cramer-Rao bound states that for any unbiased estimator:
        Var(theta_hat) >= I(theta)^{-1}

    The bound is achieved by efficient estimators, such as the MLE
    for exponential family distributions.
    """
    F = np.asarray(fisher_info, dtype=np.float64)

    # CRB is inverse of Fisher information
    try:
        crb = np.linalg.inv(F)
    except np.linalg.LinAlgError:
        # Use pseudoinverse for singular Fisher info
        crb = np.linalg.pinv(F)

    variances = np.diag(crb)
    std_bounds = np.sqrt(np.maximum(variances, 0))

    # Check efficiency if estimator variance provided
    is_efficient = None
    if estimator_variance is not None:
        est_var = np.asarray(estimator_variance, dtype=np.float64)
        if est_var.ndim == 1:
            # Compare diagonal elements
            is_efficient = np.isclose(est_var, variances, rtol=0.1)
        else:
            # Compare full matrices
            is_efficient = np.isclose(np.diag(est_var), variances, rtol=0.1)

    return CRBResult(
        crb_matrix=crb,
        variances=variances,
        std_bounds=std_bounds,
        fisher_info=F,
        is_efficient=is_efficient,
    )




[docs]
def cramer_rao_bound_biased(
    fisher_info: ArrayLike,
    bias_gradient: ArrayLike,
) -> NDArray[np.floating]:
    """
    Cramer-Rao bound for biased estimators.

    Parameters
    ----------
    fisher_info : array_like
        Fisher information matrix.
    bias_gradient : array_like
        Gradient of the bias with respect to theta.

    Returns
    -------
    crb : ndarray
        Cramer-Rao bound matrix for the biased estimator.

    Notes
    -----
    For a biased estimator with bias b(theta), the CRB becomes:
        Var(theta_hat) >= (I + db/dtheta) I^{-1} (I + db/dtheta)^T

    Examples
    --------
    >>> F = np.array([[10.0, 0], [0, 5.0]])  # Fisher info
    >>> db = np.array([[0.1, 0], [0, 0.2]])  # Bias gradient
    >>> crb_biased = cramer_rao_bound_biased(F, db)
    >>> crb_biased.shape
    (2, 2)
    """
    F = np.asarray(fisher_info, dtype=np.float64)
    db = np.asarray(bias_gradient, dtype=np.float64)

    n = F.shape[0]
    I_plus_db = np.eye(n) + db

    F_inv = np.linalg.inv(F)
    return I_plus_db @ F_inv @ I_plus_db.T




[docs]
def efficiency(
    estimator_variance: ArrayLike,
    crb: ArrayLike,
) -> NDArray[np.floating]:
    """
    Compute estimator efficiency relative to CRB.

    Parameters
    ----------
    estimator_variance : array_like
        Variance of the estimator (scalar or diagonal).
    crb : array_like
        Cramer-Rao bound (scalar or diagonal).

    Returns
    -------
    eff : ndarray
        Efficiency values in [0, 1]. Value of 1 means efficient.

    Examples
    --------
    >>> var_est = np.array([0.12, 0.25])
    >>> crb = np.array([0.1, 0.2])
    >>> efficiency(var_est, crb)
    array([0.833, 0.8])
    """
    var = np.asarray(estimator_variance, dtype=np.float64)
    crb = np.asarray(crb, dtype=np.float64)

    # Efficiency = CRB / actual variance
    return np.where(var > 0, crb / var, 0.0)



# =============================================================================
# Maximum Likelihood Estimation
# =============================================================================



[docs]
def mle_newton_raphson(
    log_likelihood: Callable[[np.ndarray[Any, Any]], float],
    score: Callable[[np.ndarray[Any, Any]], np.ndarray[Any, Any]],
    theta_init: ArrayLike,
    hessian: Optional[Callable[[np.ndarray[Any, Any]], np.ndarray[Any, Any]]] = None,
    max_iter: int = 100,
    tol: float = 1e-8,
    h: float = 1e-5,
) -> MLResult:
    """
    Maximum likelihood estimation using Newton-Raphson.

    Parameters
    ----------
    log_likelihood : callable
        Log-likelihood function L(theta).
    score : callable
        Score function (gradient of log-likelihood).
    theta_init : array_like
        Initial parameter guess.
    hessian : callable, optional
        Hessian function. If None, computed numerically.
    max_iter : int, optional
        Maximum iterations. Default 100.
    tol : float, optional
        Convergence tolerance. Default 1e-8.
    h : float, optional
        Step size for numerical Hessian.

    Returns
    -------
    result : MLResult
        MLE result with estimate, Fisher info, and covariance.

    Examples
    --------
    >>> def log_lik(theta):
    ...     return -0.5 * np.sum((data - theta[0])**2)
    >>> def score(theta):
    ...     return np.array([np.sum(data - theta[0])])
    >>> result = mle_newton_raphson(log_lik, score, np.array([0.0]))

    Notes
    -----
    Newton-Raphson update: theta_{n+1} = theta_n - H^{-1} @ score
    where H is the Hessian of the log-likelihood.
    """
    theta = np.asarray(theta_init, dtype=np.float64).copy()
    n_params = len(theta)

    converged = False

    def numerical_hessian(t: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]:
        H = np.zeros((n_params, n_params))
        for i in range(n_params):
            for j in range(i, n_params):
                t_pp = t.copy()
                t_pm = t.copy()
                t_mp = t.copy()
                t_mm = t.copy()
                t_pp[i] += h
                t_pp[j] += h
                t_pm[i] += h
                t_pm[j] -= h
                t_mp[i] -= h
                t_mp[j] += h
                t_mm[i] -= h
                t_mm[j] -= h
                H[i, j] = (
                    log_likelihood(t_pp)
                    - log_likelihood(t_pm)
                    - log_likelihood(t_mp)
                    + log_likelihood(t_mm)
                ) / (4 * h * h)
                H[j, i] = H[i, j]
        return H

    hess_func = hessian if hessian is not None else numerical_hessian

    for iteration in range(max_iter):
        g = score(theta)
        H = hess_func(theta)

        # Newton step
        try:
            delta = np.linalg.solve(H, -g)
        except np.linalg.LinAlgError:
            delta = np.linalg.lstsq(H, -g, rcond=None)[0]

        theta_new = theta + delta

        # Check convergence
        if np.linalg.norm(delta) < tol * (1 + np.linalg.norm(theta)):
            converged = True
            theta = theta_new
            break

        theta = theta_new

    # Compute final Fisher information
    fisher = -hess_func(theta)

    # Ensure positive definiteness
    eigvals = np.linalg.eigvalsh(fisher)
    if np.any(eigvals <= 0):
        # Add regularization
        fisher = fisher + np.eye(n_params) * (abs(min(eigvals)) + 1e-6)

    try:
        cov = np.linalg.inv(fisher)
    except np.linalg.LinAlgError:
        cov = np.linalg.pinv(fisher)

    return MLResult(
        theta=theta,
        log_likelihood=float(log_likelihood(theta)),
        fisher_info=fisher,
        covariance=cov,
        n_iter=iteration + 1,
        converged=converged,
    )




[docs]
def mle_scoring(
    log_likelihood: Callable[[np.ndarray[Any, Any]], float],
    score: Callable[[np.ndarray[Any, Any]], np.ndarray[Any, Any]],
    fisher_info: Callable[[np.ndarray[Any, Any]], np.ndarray[Any, Any]],
    theta_init: ArrayLike,
    max_iter: int = 100,
    tol: float = 1e-8,
) -> MLResult:
    """
    Maximum likelihood estimation using Fisher scoring.

    Uses expected Fisher information instead of observed Hessian,
    which can be more stable.

    Parameters
    ----------
    log_likelihood : callable
        Log-likelihood function.
    score : callable
        Score function (gradient).
    fisher_info : callable
        Fisher information function.
    theta_init : array_like
        Initial parameter guess.
    max_iter : int, optional
        Maximum iterations.
    tol : float, optional
        Convergence tolerance.

    Returns
    -------
    result : MLResult
        MLE result.

    Notes
    -----
    Fisher scoring update: theta_{n+1} = theta_n + I(theta_n)^{-1} @ score
    This is equivalent to Newton-Raphson when I(theta) = -E[H].

    Examples
    --------
    >>> data = np.array([1.0, 1.1, 0.9, 1.2, 0.8])
    >>> def log_lik(theta):
    ...     return -0.5 * len(data) * np.log(2*np.pi) - np.sum((data - theta[0])**2) / 2
    >>> def score(theta):
    ...     return np.array([np.sum(data - theta[0])])
    >>> def fisher(theta):
    ...     return np.array([[len(data)]])
    >>> result = mle_scoring(log_lik, score, fisher, np.array([0.0]))
    """
    theta = np.asarray(theta_init, dtype=np.float64).copy()

    converged = False

    for iteration in range(max_iter):
        g = score(theta)
        F = fisher_info(theta)

        # Fisher scoring step
        try:
            delta = np.linalg.solve(F, g)
        except np.linalg.LinAlgError:
            delta = np.linalg.lstsq(F, g, rcond=None)[0]

        theta_new = theta + delta

        # Check convergence
        if np.linalg.norm(delta) < tol * (1 + np.linalg.norm(theta)):
            converged = True
            theta = theta_new
            break

        theta = theta_new

    # Final Fisher information and covariance
    F = fisher_info(theta)
    try:
        cov = np.linalg.inv(F)
    except np.linalg.LinAlgError:
        cov = np.linalg.pinv(F)

    return MLResult(
        theta=theta,
        log_likelihood=float(log_likelihood(theta)),
        fisher_info=F,
        covariance=cov,
        n_iter=iteration + 1,
        converged=converged,
    )




[docs]
def mle_gaussian(
    data: ArrayLike,
    estimate_mean: bool = True,
    estimate_variance: bool = True,
) -> MLResult:
    """
    Closed-form MLE for Gaussian distribution.

    Parameters
    ----------
    data : array_like
        Observed data of shape (n_samples,) or (n_samples, n_features).
    estimate_mean : bool, optional
        Whether to estimate mean. Default True.
    estimate_variance : bool, optional
        Whether to estimate variance. Default True.

    Returns
    -------
    result : MLResult
        MLE result with mean and/or variance estimates.

    Examples
    --------
    >>> data = np.random.normal(5, 2, 1000)
    >>> result = mle_gaussian(data)
    >>> result.theta  # [mean, variance]
    """
    data = np.asarray(data, dtype=np.float64)

    if data.ndim == 1:
        n = len(data)
        mean_mle = np.mean(data) if estimate_mean else 0.0
        var_mle = np.var(data) if estimate_variance else 1.0

        theta = []
        if estimate_mean:
            theta.append(mean_mle)
        if estimate_variance:
            theta.append(var_mle)
        theta = np.array(theta)

        # Log-likelihood
        log_lik = -n / 2 * np.log(2 * np.pi * var_mle) - np.sum(
            (data - mean_mle) ** 2
        ) / (2 * var_mle)

        # Fisher information
        n_params = len(theta)
        fisher = np.zeros((n_params, n_params))
        idx = 0
        if estimate_mean:
            fisher[idx, idx] = n / var_mle
            idx += 1
        if estimate_variance:
            fisher[idx, idx] = n / (2 * var_mle**2)

        cov = np.linalg.inv(fisher)

    else:
        # Multivariate case
        n, d = data.shape
        mean_mle = np.mean(data, axis=0)
        cov_mle = np.cov(data.T, ddof=0)

        theta = np.concatenate([mean_mle, cov_mle.flatten()])

        # Log-likelihood
        sign, logdet = np.linalg.slogdet(cov_mle)
        centered = data - mean_mle
        cov_inv = np.linalg.inv(cov_mle)
        log_lik = (
            -n * d / 2 * np.log(2 * np.pi)
            - n / 2 * logdet
            - 0.5 * np.sum(centered @ cov_inv * centered)
        )

        # Fisher info (simplified diagonal approximation)
        n_params = len(theta)
        fisher = np.eye(n_params) * n
        cov = np.eye(n_params) / n

    return MLResult(
        theta=theta,
        log_likelihood=float(log_lik),
        fisher_info=fisher,
        covariance=cov,
        n_iter=1,
        converged=True,
    )



# =============================================================================
# Information Criteria
# =============================================================================



[docs]
def aic(log_likelihood: float, n_params: int) -> float:
    """
    Akaike Information Criterion.

    Parameters
    ----------
    log_likelihood : float
        Log-likelihood at the MLE.
    n_params : int
        Number of parameters.

    Returns
    -------
    aic : float
        AIC value (lower is better).

    Notes
    -----
    AIC = -2 * log_likelihood + 2 * n_params

    Examples
    --------
    >>> log_lik = -100.0
    >>> n_params = 3
    >>> aic(log_lik, n_params)
    206.0
    """
    return -2 * log_likelihood + 2 * n_params




[docs]
def bic(log_likelihood: float, n_params: int, n_samples: int) -> float:
    """
    Bayesian Information Criterion.

    Parameters
    ----------
    log_likelihood : float
        Log-likelihood at the MLE.
    n_params : int
        Number of parameters.
    n_samples : int
        Number of samples.

    Returns
    -------
    bic : float
        BIC value (lower is better).

    Notes
    -----
    BIC = -2 * log_likelihood + n_params * log(n_samples)

    Examples
    --------
    >>> log_lik = -100.0
    >>> bic(log_lik, n_params=3, n_samples=100)
    213.81551055796427
    """
    return -2 * log_likelihood + n_params * np.log(n_samples)




[docs]
def aicc(log_likelihood: float, n_params: int, n_samples: int) -> float:
    """
    Corrected Akaike Information Criterion.

    Parameters
    ----------
    log_likelihood : float
        Log-likelihood at the MLE.
    n_params : int
        Number of parameters.
    n_samples : int
        Number of samples.

    Returns
    -------
    aicc : float
        AICc value (lower is better).

    Notes
    -----
    AICc adds a correction for small sample sizes:
    AICc = AIC + 2*k*(k+1)/(n-k-1)

    Examples
    --------
    >>> log_lik = -50.0
    >>> aicc(log_lik, n_params=3, n_samples=20)
    109.5
    """
    k = n_params
    n = n_samples
    if n - k - 1 <= 0:
        return np.inf
    return aic(log_likelihood, k) + 2 * k * (k + 1) / (n - k - 1)



__all__ = [
    "MLResult",
    "CRBResult",
    "fisher_information_numerical",
    "fisher_information_gaussian",
    "fisher_information_exponential_family",
    "observed_fisher_information",
    "cramer_rao_bound",
    "cramer_rao_bound_biased",
    "efficiency",
    "mle_newton_raphson",
    "mle_scoring",
    "mle_gaussian",
    "aic",
    "bic",
    "aicc",
]