import os
from typing import Optional
import anndata
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer
from ..utils import download, progress
[docs]
@progress("Impute missing values")
def impute_missing_values(adata: anndata.AnnData, strategy: str, logger, indent_level: int = 1) -> None:
    """
    Impute missing values in ``adata.X`` in place using the given strategy.

    The fraction of missing (NaN) entries per feature is first stored in
    ``adata.var["percent_na"]``. If ``adata.X`` contains no NaNs, a message is
    logged and nothing else is changed. Otherwise ``adata.X`` is replaced by the
    imputed matrix and the same matrix is stored in ``adata.layers["X_imputed"]``.

    Parameters
    ----------
    adata : anndata.AnnData
        An adata object containing .X with potential missing values. Modified in place.
    strategy : str
        The imputation strategy to apply. Valid options are 'mean', 'median', 'constant',
        and 'knn'.
    logger : Logger
        A logging object whose ``info`` method accepts an ``indent_level`` keyword.
    indent_level : int, optional
        The indentation level of this step, with 1 being the default. Messages emitted
        by this function are logged one level deeper.

    Raises
    ------
    ValueError
        If missing values are present and an invalid imputation strategy is specified.

    Notes
    -----
    The 'constant' strategy fills missing values with 0. The 'knn' strategy uses
    scikit-learn's ``KNNImputer`` with its default settings. All imputers are created
    with ``keep_empty_features=True`` so that the imputed matrix always has the same
    number of features as ``adata.X``.

    The strategy is only validated when there is something to impute; with no missing
    values the function returns before looking at ``strategy``.

    NOTE(review): ``adata.layers["X_imputed"]`` is only created when imputation actually
    runs -- confirm that downstream code does not rely on it unconditionally.

    Examples
    --------
    >>> impute_missing_values(adata, "mean", logger)
    >>> adata.layers["X_imputed"]  # present only if adata.X contained NaNs
    """
    # Nested messages sit one level below this step, as in create_anndata_object.
    message_indent = indent_level + 1

    # Despite its name, "percent_na" holds the fraction (0-1) of NaN entries per feature.
    adata.var["percent_na"] = np.isnan(adata.X).sum(axis=0) / adata.X.shape[0]

    # Nothing to do when no feature has any missing entry.
    if adata.var["percent_na"].sum() == 0:
        logger.info("No missing values found. No imputation necessary", indent_level=message_indent)
        return

    # Factories rather than instances: only the requested imputer is constructed.
    imputer_factories = {
        "mean": lambda: SimpleImputer(strategy="mean", keep_empty_features=True),
        "median": lambda: SimpleImputer(strategy="median", keep_empty_features=True),
        "constant": lambda: SimpleImputer(strategy="constant", fill_value=0, keep_empty_features=True),
        # Without keep_empty_features an all-NaN feature would be dropped and the
        # result could no longer be assigned back to adata.X.
        "knn": lambda: KNNImputer(keep_empty_features=True),
    }

    make_imputer = imputer_factories.get(strategy)
    if make_imputer is None:
        raise ValueError(f"Invalid imputer strategy: {strategy}")

    logger.info(f"Imputing missing values using {strategy} strategy", indent_level=message_indent)
    adata.X = make_imputer().fit_transform(adata.X)
    adata.layers["X_imputed"] = adata.X
[docs]
@progress("Log data statistics")
def log_data_statistics(X: np.ndarray, logger, indent_level: int = 1) -> None:
    """
    Log basic shape and missing-value statistics of a 2-D array.

    Four messages are logged: the number of observations (rows), the number of
    features (columns), the total count of NaN entries, and the percentage of
    entries that are NaN.

    Parameters
    ----------
    X : np.ndarray
        A two-dimensional numpy array containing the dataset to be analyzed.
    logger : Logger
        A logging object whose ``info`` method accepts an ``indent_level`` keyword.
    indent_level : int, optional
        The indentation level of this step, with 1 being the default. Messages emitted
        by this function are logged one level deeper.

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional (its shape cannot be unpacked into
        rows and columns).

    Notes
    -----
    For an array with zero rows or zero columns the percentage of missing values is
    reported as 0.00% instead of dividing by zero.

    Example
    -------
    >>> data = np.random.rand(100, 5)
    >>> log_data_statistics(data, logger)
    # Logs number of observations, features, and details about missing values.
    """
    # Nested messages sit one level below this step, as in create_anndata_object.
    message_indent = indent_level + 1

    n_obs, n_features = X.shape
    n_entries = n_obs * n_features
    total_nas = np.isnan(X).sum()
    # Guard the empty-array case: 0 / 0 would otherwise log "nan%" with a RuntimeWarning.
    percent_nas = 100 * total_nas / n_entries if n_entries else 0.0

    logger.info(f"There are {n_obs} observations", indent_level=message_indent)
    logger.info(f"There are {n_features} features", indent_level=message_indent)
    logger.info(f"Total missing values: {total_nas}", indent_level=message_indent)
    logger.info(f"Percentage of missing values: {percent_nas:.2f}%", indent_level=message_indent)
[docs]
@progress("Create anndata object")
def create_anndata_object(
    df: pd.DataFrame,
    logger,
    indent_level: int = 1,
) -> anndata.AnnData:
    """
    Create an AnnData object from a pandas DataFrame.

    Columns that contain only missing values are dropped (with a warning) before the
    remaining values become ``X``. The same values are also stored in the
    ``"X_original"`` layer. Index and column labels are converted to strings and used
    as observation and variable names.

    Parameters
    ----------
    df : pd.DataFrame
        A pandas DataFrame with sample names as the index and the feature names as columns.
    logger : Logger
        A logging object providing ``warning`` and ``error`` methods.
    indent_level : int, optional
        The level of indentation for the logger, with 1 being the default. The
        dropped-columns warning is logged one level deeper.

    Returns
    -------
    anndata.AnnData
        An AnnData object populated with the data, observation names, and variable names.

    Raises
    ------
    ValueError
        If, after dropping all-NaN columns and converting the labels to strings,
        two or more features share the same name.

    Notes
    -----
    All-NaN columns are removed by position rather than by label, so a column that
    merely shares its name with an all-NaN column is kept (and is then subject to the
    duplicate-name check like any other column).

    Example
    -------
    >>> data = pd.DataFrame(np.random.rand(100, 5))
    >>> ann_data = create_anndata_object(data, logger)
    # Creates an AnnData object with 100 observations and 5 variables.
    """
    # Positional boolean mask of columns that contain only NAs. A plain array is used
    # so that duplicate column labels cannot interfere with the selection below.
    all_na_mask = df.isna().all().to_numpy()
    num_columns_dropped = all_na_mask.sum()

    if num_columns_dropped > 0:
        columns_with_nas = df.columns[all_na_mask]
        # Show at most three of the dropped column names in the log.
        sample_columns = columns_with_nas[:3].tolist()
        logger.warning(
            f"Dropping {num_columns_dropped} columns with only NAs: {sample_columns}, etc.",
            indent_level=indent_level + 1,
        )
        # Select by position: dropping by label would also remove columns that
        # happen to share a name with an all-NaN column.
        df = df.loc[:, ~all_na_mask]

    # Extract information from df
    X = df.values
    obs_names = df.index.astype(str)
    var_names = df.columns.astype(str)

    # Feature names must be unique once converted to strings.
    if not var_names.is_unique:
        duplicated_names = var_names[var_names.duplicated()].unique()[:3].tolist()
        logger.error("There are duplicate feature names!")
        raise ValueError(f"There are duplicate feature names, e.g. {duplicated_names}")

    obs = pd.DataFrame(index=obs_names)
    var = pd.DataFrame(index=var_names)
    return anndata.AnnData(X=X, obs=obs, var=var, layers={"X_original": X})
[docs]
@progress("Add imputer strategy to adata.uns")
def add_unstructured_data(adata: anndata.AnnData, imputer_strategy: str, logger, indent_level: int = 1) -> None:
    """
    Record the imputation strategy in the unstructured annotations of an AnnData object.

    The value is written to ``adata.uns["imputer_strategy"]``; the object is modified
    in place and nothing is returned.

    Parameters
    ----------
    adata : anndata.AnnData
        The AnnData object whose ``uns`` mapping is updated.
    imputer_strategy : str
        The strategy used for imputing missing values in the dataset, stored as given.
    logger : Logger
        A logging object. Not used in the function body; presumably consumed by the
        ``progress`` decorator -- verify against its implementation.
    indent_level : int, optional
        The level of indentation for the logger, with 1 being the default. Not used in
        the function body.

    Notes
    -----
    Only the ``'imputer_strategy'`` key is written; any existing value under that key
    is overwritten.

    Example
    -------
    >>> from anndata import AnnData
    >>> adata = AnnData(np.random.rand(5, 3))
    >>> add_unstructured_data(adata, "mean", logger)
    >>> adata.uns["imputer_strategy"]
    'mean'
    """
    uns_key = "imputer_strategy"
    adata.uns[uns_key] = imputer_strategy