Source code for pyaging.predict._pred

import gc

import anndata

from ._pred_utils import *



[docs]
def predict_age(
    adata: anndata.AnnData,
    clock_names: str = "horvath2013",
    dir: str = "pyaging_data",
    batch_size: int = 1024,
    clean: bool = True,
    verbose: bool = True,
) -> anndata.AnnData:
    """
    Predicts biological age using specified aging clocks.

    This function takes an AnnData object and applies one or more specified aging
    clock models to predict the biological age of the samples. It handles the entire pipeline from data
    preprocessing, model loading, prediction, to postprocessing. It also enriches the input AnnData
    object with the predicted ages and relevant clock metadata.

    Parameters
    ----------
    adata: AnnData
        An AnnData object. The object should have .X attribute for the
        data matrix and .var_names for feature names.

    clock_names: str or list of str, optional
        Names of the aging clocks to be applied. It can be a single clock name as a string or a list
        of clock names, by default "horvath2013".

    dir: str
        Retained for backward compatibility. Hugging Face files use its standard cache.

    batch_size: int
        The batch size for age inferece. Defaults to 1024.

    clean: bool
        Whether to delete the matrix data create for each clock in adata.obsm[X_clock]. Defaults to True.

    verbose: bool
        Whether to log the output to console with the logger. Defaults to True.

    Returns
    -------
    AnnData
        The input AnnData object enriched with the predicted ages and clock metadata in the .obs and
        .uns attributes, respectively.

    Notes
    -----
    The function is designed to be flexible and can handle both single and multiple clock predictions.
    The predicted ages are appended to the .obs attribute of the AnnData object with the clock name as
    the key. The metadata of each clock used in the prediction is stored in the .uns attribute. Change
    batch size depending on memory constraints.

    It is important that the input AnnData object's .X attribute contains data suitable for age
    prediction.

    The function automatically handles the transfer of data and models to the appropriate compute
    device (CPU or GPU) based on system configuration.

    Examples
    --------
    >>> adata = anndata.read_h5ad("sample_data.h5ad")
    >>> adata = predict_age(adata, clock_names=["horvath2013", "hannum"])
    >>> adata.obs["horvath2013"]  # Access predicted ages by clock name

    """
    logger = LoggerManager.gen_logger("predict_age")
    if not verbose:
        silence_logger("predict_age")
    logger.first_info("Starting predict_age function")

    # Ensure clock_names is a list with lowercase names
    if isinstance(clock_names, str):
        clock_names = [clock_names]
    clock_names = [clock_name.lower() for clock_name in clock_names]

    # Set device for PyTorch operations
    device = set_torch_device(logger)

    for clock_name in clock_names:
        logger.info(f"🕒 Processing clock: {clock_name}", indent_level=1)

        # Load and prepare the clock
        model = load_clock(clock_name, device, dir, logger, indent_level=2)

        # Disclaimer for commercial clocks
        if model.metadata.get("research_only", False):  # Defaults to False if 'research_only' key doesn't exist
            logger.warning(
                f"⚠️ Clock '{clock_name}' is for research purposes only. Please check the clock's documentation or notes for more information.",
                indent_level=2,
            )

        # Check and update adata for missing features
        check_features_in_adata(
            adata,
            model,
            logger,
            indent_level=2,
        )

        # Perform age prediction using the model applying preprocessing and postprocessing steps
        predicted_ages_tensor = predict_ages_with_model(adata, model, device, batch_size, logger, indent_level=2)

        # Add predicted ages and clock metadata to adata
        add_pred_ages_and_clock_metadata_adata(adata, model, predicted_ages_tensor, dir, logger, indent_level=2)

        # Delete the clock matrix object
        if clean:
            del adata.obsm[f"X_{clock_name}"]

        # Flush memory
        gc.collect()
        torch.cuda.empty_cache()

    logger.done()