Source code for pyaging.predict._preprocessing

import numpy as np



[docs]
def scale(x, scaler):
    """
    Transform the input data with the supplied scaler.

    Parameters
    ----------
    x
        Data passed unchanged to ``scaler.transform``.
    scaler
        Any object exposing a ``transform`` method (presumably an
        already-fitted scikit-learn-style scaler -- confirm with callers).

    Returns
    -------
    Whatever ``scaler.transform(x)`` returns.
    """
    # All the work is delegated to the scaler; nothing is fitted here.
    return scaler.transform(x)




[docs]
def scale_with_gold_standard(x, column_means, column_stds):
    """
    Standardize each column of ``x`` with externally supplied statistics.

    Computes ``(x - column_means) / column_stds`` using NumPy broadcasting.

    Parameters
    ----------
    x
        Data to scale.
    column_means
        Reference mean subtracted from ``x``.
    column_stds
        Reference standard deviation that the centered data is divided by.
        It is copied into a new NumPy array, so the caller's object is not
        modified.

    Returns
    -------
    The scaled data.
    """
    # Work on a private copy so the caller's standard deviations stay intact.
    safe_stds = np.array(column_stds)

    # A (near-)constant column has a standard deviation of ~0; dividing by 1
    # instead leaves such a column merely centered rather than inf/NaN.
    near_zero = np.abs(safe_stds) < 10e-10
    np.putmask(safe_stds, near_zero, 1)

    return (x - column_means) / safe_stds




[docs]
def scale_row(x, x_overlap):
    """
    Standardize each row of ``x`` using statistics taken from ``x_overlap``.

    The per-row mean and standard deviation are computed from ``x_overlap``
    (along axis 1) and then applied to the matching row of ``x``.

    Parameters
    ----------
    x
        Data to scale.
    x_overlap
        Data from which the row means and standard deviations are computed.

    Returns
    -------
    ``(x - row_mean) / row_std`` for every row.
    """
    # keepdims=True keeps a trailing axis of length 1 so the statistics
    # broadcast against the rows of x.
    center = np.mean(x_overlap, axis=1, keepdims=True)
    spread = np.std(x_overlap, axis=1, keepdims=True)

    # A constant row has zero spread; divide by 1 so it is only centered.
    np.putmask(spread, spread == 0, 1)

    return (x - center) / spread




[docs]
def binarize(x):
    """
    Binarizes an array based on the median of each row, excluding zeros.

    For every row, the median of its non-zero entries is computed and each
    entry is replaced by 1 if it is strictly greater than that median and by
    0 otherwise. A row without any non-zero entry is set to all zeros.

    Parameters
    ----------
    x
        Array to binarize, iterated row by row (assumed to be a 2-D NumPy
        array -- confirm with callers). It is modified in place.

    Returns
    -------
    The same object ``x`` that was passed in, now holding the binarized
    values stored in the array's original dtype.
    """
    for i, row in enumerate(x):
        # Zeros are excluded from the median.
        non_zero_elements = row[row != 0]

        if non_zero_elements.size == 0:
            # np.median of an empty selection warns and yields NaN; the
            # comparison against NaN would be all False anyway, so write the
            # zeros directly and skip the warning.
            x[i] = 0
            continue

        # The comparison is fully evaluated before it is written back.
        x[i] = row > np.median(non_zero_elements)

    return x




[docs]
def tpm_norm_log1p(x, lengths):
    """
    Convert counts to TPM (Transcripts Per Million) and apply ``log1p``.

    Parameters
    ----------
    x
        Array of counts; each row is normalized independently (rows are
        presumably samples and columns features -- confirm with callers).
    lengths
        Lengths that the counts are divided by, broadcast against ``x``.

    Returns
    -------
    ``log1p`` of the TPM values. A row whose length-normalized values sum to
    zero is divided by zero and therefore does not yield finite values.
    """
    # Length-normalize the counts.
    length_scaled = 1000 * (x / lengths)

    # Rescale every row so that it sums to one million.
    per_row_total = length_scaled.sum(axis=1, keepdims=True)
    per_million = 1e6 * (length_scaled / per_row_total)

    return np.log1p(per_million)




[docs]
def quantile_normalize_with_gold_standard(x, gold_standard_means):
    """
    Quantile-normalize every row of ``x`` against a gold standard.

    Within each row, the value at a given rank is replaced by the value at
    the corresponding quantile position of the sorted gold standard means.

    Parameters
    ----------
    x
        2-D array to normalize. It is copied; the input is left untouched.
    gold_standard_means
        Reference values; they are sorted internally and may differ in
        length from the number of columns of ``x``.

    Returns
    -------
    A copy of ``x`` (same shape and dtype) holding the normalized values.
    """
    result = x.copy()
    reference = np.sort(gold_standard_means)

    # Position in the sorted reference for each rank 0..n_columns-1. It only
    # depends on the two lengths, so it is computed once for all rows.
    rank_to_reference = np.round(
        np.linspace(0, len(reference) - 1, result.shape[1])
    ).astype(int)

    for row in result:
        # argsort of the argsort gives the rank of each element in its row.
        ranks = np.argsort(np.argsort(row))
        # `row` is a view into `result`, so this writes into the copy.
        row[:] = reference[rank_to_reference[ranks]]

    return result