# Source code for pytcl.gpu.utils

"""
GPU utility functions for array management and device detection.

This module provides utilities for:
- Checking GPU availability (CUDA via CuPy or Apple Silicon via MLX)
- Transferring arrays between CPU and GPU
- Getting the appropriate array module (numpy, cupy, or mlx)
- Memory management
- Automatic backend selection based on platform

The module automatically selects the appropriate backend:
- On Apple Silicon (M1/M2/M3): Uses MLX if available
- On systems with NVIDIA GPUs: Uses CuPy if available
- Falls back to CPU (numpy) if no GPU backend is available

Examples
--------
>>> import numpy as np
>>> from pytcl.gpu.utils import is_gpu_available, to_gpu, to_cpu
>>> x_numpy = np.array([1.0, 2.0, 3.0])
>>> if is_gpu_available():
...     x_gpu = to_gpu(x_numpy)
...     # ... perform GPU operations ...
...     x_cpu = to_cpu(x_gpu)
"""

import logging
import platform
from functools import lru_cache
from typing import Any, Literal, Union

import numpy as np
from numpy.typing import ArrayLike, NDArray

from pytcl.core.optional_deps import is_available

# Module-level logger; the logger name is hard-coded to this module's dotted path.
_logger = logging.getLogger("pytcl.gpu.utils")

# Type alias for arrays that could be numpy, cupy, or mlx.
# It is plain ``Any`` because the GPU array classes are only importable when
# the corresponding optional backend is installed.
GPUArray = Any  # Would be cp.ndarray or mx.array if backend is available

# Names of the array backends this module can report or select;
# "numpy" is the CPU fallback.
BackendType = Literal["cupy", "mlx", "numpy"]



[docs]
@lru_cache(maxsize=1)
def is_apple_silicon() -> bool:
    """
    Report whether the interpreter runs on an ARM64 macOS machine.

    Returns
    -------
    bool
        True when ``platform.system()`` is ``"Darwin"`` and
        ``platform.machine()`` is ``"arm64"``; False otherwise.

    Notes
    -----
    The answer is memoized by ``lru_cache``, so the platform is only
    queried on the first call.

    Examples
    --------
    >>> from pytcl.gpu.utils import is_apple_silicon
    >>> if is_apple_silicon():
    ...     print("Running on Apple Silicon")
    """
    # Guard clause: anything that is not macOS can be rejected immediately.
    if platform.system() != "Darwin":
        return False
    return platform.machine() == "arm64"




[docs]
@lru_cache(maxsize=1)
def is_mlx_available() -> bool:
    """
    Report whether the MLX backend can be used on this machine.

    Three conditions are checked in order, and the first failure ends the
    check:

    1. the host is Apple Silicon (``is_apple_silicon()``),
    2. the ``mlx`` package is reported as installed by ``is_available``,
    3. ``mlx.core`` imports and a small array can be constructed.

    Returns
    -------
    bool
        True if all three checks pass, False otherwise.

    Notes
    -----
    The result is memoized by ``lru_cache``. Any exception raised while
    importing or probing MLX is logged at debug level and reported as
    "not available" rather than propagated.

    Examples
    --------
    >>> from pytcl.gpu.utils import is_mlx_available
    >>> if is_mlx_available():
    ...     print("MLX acceleration enabled")
    """
    on_apple_silicon = is_apple_silicon()
    if not on_apple_silicon:
        _logger.debug("Not on Apple Silicon, MLX not applicable")
        return False

    mlx_installed = is_available("mlx")
    if not mlx_installed:
        _logger.debug("MLX not installed")
        return False

    try:
        import mlx.core as mlx_core

        # Smoke test: constructing a tiny array proves the runtime works.
        mlx_core.array([1.0, 2.0, 3.0])
        _logger.info("MLX available on Apple Silicon")
        return True
    except Exception as exc:
        _logger.debug("MLX not functional: %s", exc)
        return False




[docs]
@lru_cache(maxsize=1)
def is_cupy_available() -> bool:
    """
    Report whether the CuPy (CUDA) backend can be used on this machine.

    The check passes only when ``is_available("cupy")`` is true, CuPy
    imports, and CUDA device 0 can be queried for its compute capability.

    Returns
    -------
    bool
        True if CuPy is installed and device 0 responds, False otherwise.

    Notes
    -----
    The result is memoized by ``lru_cache``. Any exception raised while
    importing CuPy or probing the device is logged at debug level and
    reported as "not available" rather than propagated.

    Examples
    --------
    >>> from pytcl.gpu.utils import is_cupy_available
    >>> if is_cupy_available():
    ...     print("CUDA GPU available")
    """
    cupy_installed = is_available("cupy")
    if not cupy_installed:
        _logger.debug("CuPy not installed")
        return False

    try:
        import cupy

        # Reading the compute capability forces a real query of device 0.
        first_gpu = cupy.cuda.Device(0)
        _capability = first_gpu.compute_capability
        _logger.info("CuPy available: %s", first_gpu.pci_bus_id)
        return True
    except Exception as exc:
        _logger.debug("CuPy/CUDA not available: %s", exc)
        return False




[docs]
@lru_cache(maxsize=1)
def get_backend() -> BackendType:
    """
    Pick the preferred array backend for the current machine.

    Backends are tried in this order:

    1. ``"mlx"`` when running on Apple Silicon with a working MLX
    2. ``"cupy"`` when a working CuPy/CUDA setup is detected
    3. ``"numpy"`` as the CPU fallback

    Returns
    -------
    str
        One of "mlx", "cupy", or "numpy".

    Notes
    -----
    The choice is memoized by ``lru_cache`` after the first call.

    Examples
    --------
    >>> from pytcl.gpu.utils import get_backend
    >>> backend = get_backend()
    >>> print(f"Using {backend} backend")
    """
    if is_apple_silicon() and is_mlx_available():
        return "mlx"
    return "cupy" if is_cupy_available() else "numpy"




[docs]
@lru_cache(maxsize=1)
def is_gpu_available() -> bool:
    """
    Report whether any GPU backend can be used.

    A GPU counts as available when either ``is_mlx_available()`` or
    ``is_cupy_available()`` is true; MLX is checked first.

    Returns
    -------
    bool
        True if GPU acceleration is available.

    Examples
    --------
    >>> from pytcl.gpu.utils import is_gpu_available
    >>> if is_gpu_available():
    ...     print("GPU acceleration enabled")
    ... else:
    ...     print("Falling back to CPU")

    Notes
    -----
    The result is memoized by ``lru_cache`` after the first call.
    Use `get_backend()` to find out which backend would be selected.
    """
    if is_mlx_available():
        return True
    return is_cupy_available()




[docs]
def get_array_module(arr: ArrayLike) -> Any:
    """
    Return the array namespace that matches the type of ``arr``.

    This lets calling code be written once against an ``xp`` module and
    run on numpy, cupy, or mlx arrays alike.

    Parameters
    ----------
    arr : array_like
        Input array (numpy, cupy, or mlx).

    Returns
    -------
    module
        ``mlx.core`` if ``arr`` is an ``mlx.core.array``, ``cupy`` if it is
        a ``cupy.ndarray``, and ``numpy`` for everything else.

    Notes
    -----
    An optional backend is only imported when ``is_available`` reports it
    as installed. MLX is checked before CuPy.

    Examples
    --------
    >>> import numpy as np
    >>> from pytcl.gpu.utils import get_array_module
    >>> x = np.array([1, 2, 3])
    >>> xp = get_array_module(x)
    >>> xp is np
    True

    >>> # With CuPy array
    >>> import cupy as cp
    >>> x_gpu = cp.array([1, 2, 3])
    >>> xp = get_array_module(x_gpu)
    >>> xp is cp
    True

    >>> # With MLX array
    >>> import mlx.core as mx
    >>> x_mlx = mx.array([1, 2, 3])
    >>> xp = get_array_module(x_mlx)
    >>> xp.__name__
    'mlx.core'
    """
    # MLX takes precedence over CuPy.
    if is_available("mlx"):
        import mlx.core as mlx_core

        is_mlx_array = isinstance(arr, mlx_core.array)
        if is_mlx_array:
            return mlx_core

    if is_available("cupy"):
        import cupy

        is_cupy_array = isinstance(arr, cupy.ndarray)
        if is_cupy_array:
            return cupy

    # Anything that is not a recognised GPU array is handled by numpy.
    return np




[docs]
def to_gpu(
    arr: ArrayLike,
    dtype: Any = None,
    backend: Union[BackendType, None] = None,
) -> GPUArray:
    """
    Transfer an array to GPU memory.

    Automatically selects the best available backend (MLX on Apple Silicon,
    CuPy on NVIDIA GPUs) unless a specific backend is requested.

    Parameters
    ----------
    arr : array_like
        Input array (typically numpy).
    dtype : dtype, optional
        Data type for the GPU array. If None, uses the input dtype.
    backend : str, optional
        Specific backend to use ("mlx" or "cupy"). If None, the result of
        ``get_backend()`` is used.

    Returns
    -------
    GPUArray
        Array in GPU memory (cupy.ndarray or mlx.array).

    Raises
    ------
    RuntimeError
        If no GPU is available.
    ValueError
        If ``backend`` is given but is neither "mlx" nor "cupy".
    DependencyError
        If required backend is not installed.

    Examples
    --------
    >>> import numpy as np
    >>> from pytcl.gpu.utils import to_gpu, is_gpu_available
    >>> x = np.array([1.0, 2.0, 3.0])
    >>> if is_gpu_available():
    ...     x_gpu = to_gpu(x)
    ...     print(type(x_gpu).__name__)
    'ndarray'  # cupy.ndarray or 'array' for mlx

    Notes
    -----
    If the input is already an array of the selected backend, it is returned
    as-is (or converted to the requested dtype).
    """
    from pytcl.core.optional_deps import import_optional

    # Checked first so a machine without any GPU always gets RuntimeError,
    # whatever value was passed for ``backend``.
    if not is_gpu_available():
        raise RuntimeError(
            "No GPU available. Check CUDA installation or MLX availability."
        )

    # Determine backend
    if backend is None:
        backend = get_backend()
    elif backend not in ("mlx", "cupy"):
        # Without this check any unrecognised value (including "numpy")
        # would silently be treated as a request for CuPy.
        raise ValueError(
            f"Unsupported GPU backend {backend!r}; expected 'mlx' or 'cupy'."
        )

    # Use MLX backend
    if backend == "mlx":
        mx = import_optional(
            "mlx.core",
            package="mlx",
            extra="gpu-apple",
            feature="Apple Silicon GPU acceleration",
        )

        # Already an MLX array: only a dtype conversion may be needed.
        if isinstance(arr, mx.array):
            if dtype is None:
                return arr
            # MLX has its own dtype objects, so translate the numpy dtype.
            return arr.astype(_numpy_dtype_to_mlx(mx, dtype))

        # Stage through numpy so lists, scalars, etc. are accepted.
        host = np.asarray(arr)
        if dtype is not None:
            # copy=False skips the extra host copy when the dtype already matches.
            host = host.astype(dtype, copy=False)
        return mx.array(host)

    # Use CuPy backend
    cp = import_optional("cupy", extra="gpu", feature="GPU acceleration")

    # Already a CuPy array: only a dtype conversion may be needed.
    if isinstance(arr, cp.ndarray):
        if dtype is not None and arr.dtype != dtype:
            return arr.astype(dtype)
        return arr

    # Stage through numpy so lists, scalars, etc. are accepted.
    host = np.asarray(arr)
    if dtype is not None:
        host = host.astype(dtype, copy=False)
    return cp.asarray(host)



def _numpy_dtype_to_mlx(mx: Any, dtype: Any) -> Any:
    """
    Convert a numpy-style dtype specification to an MLX dtype.

    Parameters
    ----------
    mx : module
        The ``mlx.core`` module (or any object exposing the same dtype
        attributes such as ``float32`` and ``int32``).
    dtype : dtype-like
        Anything ``numpy.dtype`` accepts (``np.float32``,
        ``np.dtype("int64")``, ``"int32"``, ``bool`` ...). An object that is
        already an instance of ``mx.Dtype`` is returned unchanged.

    Returns
    -------
    object
        The matching attribute of ``mx``. ``float64`` maps to ``mx.float32``.
        Anything that has no counterpart, or that numpy cannot interpret as
        a dtype, falls back to ``mx.float32``.
    """
    # Pass through values that are already MLX dtypes instead of coercing
    # them to float32.
    mlx_dtype_cls = getattr(mx, "Dtype", None)
    if isinstance(mlx_dtype_cls, type) and isinstance(dtype, mlx_dtype_cls):
        return dtype

    # Normalise through numpy so that scalar classes, dtype instances,
    # strings, and Python types all resolve to one canonical name.
    try:
        canonical = np.dtype(dtype).name
    except (TypeError, ValueError):
        return mx.float32

    # numpy canonical name -> attribute name on the MLX module.
    mlx_attr_by_name = {
        "bool": "bool_",
        "int8": "int8",
        "int16": "int16",
        "int32": "int32",
        "int64": "int64",
        "uint8": "uint8",
        "uint16": "uint16",
        "uint32": "uint32",
        "uint64": "uint64",
        "float16": "float16",
        "float32": "float32",
        "float64": "float32",  # MLX prefers float32
        "complex64": "complex64",
    }
    attr = mlx_attr_by_name.get(canonical)
    if attr is None:
        return mx.float32
    # getattr with a default keeps the float32 fallback if this MLX build
    # lacks the attribute.
    return getattr(mx, attr, mx.float32)



[docs]
def to_cpu(arr: Union[ArrayLike, GPUArray]) -> NDArray[np.floating]:
    """
    Bring an array into CPU memory as a numpy array.

    Parameters
    ----------
    arr : array_like, cupy.ndarray, or mlx.array
        Input array (numpy, cupy, or mlx).

    Returns
    -------
    numpy.ndarray
        Array in CPU memory.

    Examples
    --------
    >>> import numpy as np
    >>> from pytcl.gpu.utils import to_gpu, to_cpu, is_gpu_available
    >>> x = np.array([1.0, 2.0, 3.0])
    >>> if is_gpu_available():
    ...     x_gpu = to_gpu(x)
    ...     x_cpu = to_cpu(x_gpu)
    ...     np.allclose(x, x_cpu)
    True

    Notes
    -----
    A numpy array is handed back unchanged (same object, no copy). MLX
    arrays are converted with ``np.array``, CuPy arrays with
    ``cupy.asnumpy``, and any other input goes through ``np.asarray``.
    """
    # Fast path: nothing to transfer.
    if isinstance(arr, np.ndarray):
        return arr

    # MLX array -> numpy
    if is_available("mlx"):
        import mlx.core as mlx_core

        if isinstance(arr, mlx_core.array):
            return np.array(arr)

    # CuPy array -> numpy
    if is_available("cupy"):
        import cupy

        if isinstance(arr, cupy.ndarray):
            return cupy.asnumpy(arr)

    # Lists, scalars and other array-likes.
    return np.asarray(arr)




[docs]
def ensure_gpu_array(
    arr: ArrayLike,
    dtype: Any = np.float64,
    backend: Union[BackendType, None] = None,
) -> GPUArray:
    """
    Ensure an array is on the GPU with the specified dtype.

    Parameters
    ----------
    arr : array_like
        Input array.
    dtype : dtype
        Desired data type. On the MLX backend the request is translated by
        ``_numpy_dtype_to_mlx``, which turns ``float64`` into ``float32``.
    backend : str, optional
        Specific backend to use ("mlx", "cupy"). If None, auto-selects.

    Returns
    -------
    GPUArray
        Array on GPU with specified dtype (cupy.ndarray or mlx.array).

    Raises
    ------
    RuntimeError, DependencyError
        Propagated from ``to_gpu``.

    Examples
    --------
    >>> import numpy as np
    >>> from pytcl.gpu.utils import ensure_gpu_array, is_gpu_available
    >>> x = np.array([1, 2, 3])
    >>> if is_gpu_available():
    ...     x_gpu = ensure_gpu_array(x, dtype=np.float32)
    ...     print(x_gpu.dtype)
    """
    gpu_arr = to_gpu(arr, backend=backend)

    # Resolve the backend once and use it for every later decision, so an
    # explicitly requested backend is never mixed up with the auto-selected one.
    active_backend = backend if backend is not None else get_backend()

    if active_backend == "mlx":
        import mlx.core as mx

        # Translate to an MLX dtype first, so the comparison below is between
        # two MLX dtypes rather than an MLX dtype and a numpy one.
        target = _numpy_dtype_to_mlx(mx, dtype)
        if gpu_arr.dtype != target:
            gpu_arr = gpu_arr.astype(target)
        return gpu_arr

    if hasattr(gpu_arr, "dtype") and gpu_arr.dtype != dtype:
        gpu_arr = gpu_arr.astype(dtype)
    return gpu_arr




[docs]
def sync_gpu() -> None:
    """
    Synchronize GPU operations.

    This blocks until pending GPU work on the backend's default stream has
    completed. Useful for accurate timing measurements. Does nothing when
    the selected backend is "numpy".

    Examples
    --------
    >>> import time
    >>> from pytcl.gpu.utils import sync_gpu, is_gpu_available
    >>> if is_gpu_available():
    ...     start = time.time()
    ...     # ... perform GPU operations ...
    ...     sync_gpu()  # Wait for completion
    ...     elapsed = time.time() - start
    """
    backend = get_backend()

    if backend == "mlx":
        import mlx.core as mx

        # mx.eval() called without arrays has nothing to evaluate, so it is
        # not a dependable barrier. Prefer the explicit stream synchronization
        # and keep the old call only for MLX builds that lack it.
        synchronize = getattr(mx, "synchronize", None)
        if synchronize is not None:
            synchronize()
        else:
            mx.eval()
    elif backend == "cupy":
        import cupy as cp

        # Waits on the null (default) CUDA stream.
        cp.cuda.Stream.null.synchronize()




[docs]
def get_gpu_memory_info() -> dict[str, Union[str, int]]:
    """
    Describe memory usage for the backend chosen by ``get_backend()``.

    Returns
    -------
    dict
        Always contains:

        - 'backend': Backend in use ("mlx", "cupy", or "numpy")
        - 'free': Free memory in bytes
        - 'total': Total memory in bytes
        - 'used': Used memory in bytes

        For "numpy" the three sizes are 0. For "mlx" they are -1 (not
        reported) and a 'device' entry holds ``str(mx.default_device())``.
        For "cupy" the sizes come from ``cp.cuda.Device().mem_info`` and two
        extra entries, 'pool_used' and 'pool_total', report the default
        memory pool.

    Examples
    --------
    >>> from pytcl.gpu.utils import get_gpu_memory_info, is_gpu_available
    >>> if is_gpu_available():
    ...     info = get_gpu_memory_info()
    ...     print(f"Backend: {info['backend']}")
    """
    size_keys = ("free", "total", "used")
    selected = get_backend()

    if selected == "numpy":
        # CPU fallback: there is no GPU memory to report.
        return {"backend": "numpy", **dict.fromkeys(size_keys, 0)}

    if selected == "mlx":
        import mlx.core as mx

        # Sizes are reported as -1, meaning "not available".
        report = {"backend": "mlx", "device": str(mx.default_device())}
        report.update(dict.fromkeys(size_keys, -1))
        return report

    # Remaining case: CuPy.
    import cupy as cp

    pool = cp.get_default_memory_pool()
    free_bytes, total_bytes = cp.cuda.Device().mem_info

    report = {"backend": "cupy"}
    report["free"] = free_bytes
    report["total"] = total_bytes
    report["used"] = total_bytes - free_bytes
    report["pool_used"] = pool.used_bytes()
    report["pool_total"] = pool.total_bytes()
    return report




[docs]
def clear_gpu_memory() -> None:
    """
    Clear GPU memory pools.

    This frees cached memory blocks held by the GPU backend.
    Call this when you need to free GPU memory for other operations.
    Does nothing when the selected backend is "numpy".

    Examples
    --------
    >>> from pytcl.gpu.utils import clear_gpu_memory, is_gpu_available
    >>> if is_gpu_available():
    ...     # ... perform GPU operations ...
    ...     clear_gpu_memory()  # Free cached memory
    """
    backend = get_backend()

    if backend == "mlx":
        import mlx.core as mx

        # Retained from the original implementation.
        mx.eval()

        # Depending on the MLX version the cache-clearing call is
        # mx.clear_cache() or mx.metal.clear_cache(); use whichever exists
        # and do nothing on builds that offer neither.
        clear_cache = getattr(mx, "clear_cache", None)
        if clear_cache is None:
            clear_cache = getattr(getattr(mx, "metal", None), "clear_cache", None)
        if clear_cache is not None:
            clear_cache()
    elif backend == "cupy":
        import cupy as cp

        # Return every cached block of the default device memory pool.
        mempool = cp.get_default_memory_pool()
        mempool.free_all_blocks()



# Public API of this module: the names exported by
# ``from pytcl.gpu.utils import *``. The private helper
# ``_numpy_dtype_to_mlx`` is intentionally not listed.
__all__ = [
    # Platform detection
    "is_apple_silicon",
    "is_mlx_available",
    "is_cupy_available",
    "get_backend",
    # Availability check
    "is_gpu_available",
    # Array operations
    "get_array_module",
    "to_gpu",
    "to_cpu",
    "ensure_gpu_array",
    # Synchronization and memory
    "sync_gpu",
    "get_gpu_memory_info",
    "clear_gpu_memory",
    # Type hints
    "GPUArray",
    "BackendType",
]