Source code for HieDiff._regulons

# -*- coding: utf-8 -*-

from typing import Optional, Sequence

import numpy as np
import pandas as pd
from anndata import AnnData
import os
from yaml import load, dump
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper

from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase
from pyscenic.utils import modules_from_adjacencies
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell as pyscenic_aucell


[docs]def regulons( adata: AnnData, tf_names: Optional[Sequence[str]] = None, motif_annotations_fname: Optional[str] = None, db_fnames: Optional[Sequence[str]] = None, thresholds=(0.75, 0.90), top_n_targets=(50,), top_n_regulators=(5,10,50), min_genes=20, copy: bool = False, ) -> Optional[AnnData]: ''' Regulon inference for spatial transcriptomics [Aibar17]_. Parameters ---------- adata Annotated data matrix. tf_names List of transcription factors (TFs). motif_annotations_fname Path of motif annotation file. db_fnames List of path of ranking database files. thresholds The first method to create the TF-modules based on the best targets for each transcription factor. top_n_targets The second method is to select the top targets for a given TF. top_n_regulators The alternative way to create the TF-modules is to select the best regulators for each gene. min_genes The required minimum number of genes in a resulting module. copy Return a copy instead of writing to ``adata``. Returns ------- Depending on ``copy``, returns or updates ``adata`` with the following fields. .uns['regulon'] The inferred regulons. ''' adata = adata.copy() if copy else adata neighbors_var_key = 'manifold_var' if neighbors_var_key not in adata.uns: raise ValueError(f'Did not find .uns["{neighbors_var_key}"].') conns_var_key = adata.uns[neighbors_var_key]['connectivities_key'] if conns_var_key not in adata.varp: raise ValueError(f'Did not find .varp["{conns_var_key}"].') neighbors = pd.DataFrame(adata.varp[conns_var_key].toarray(), index=adata.var_names, columns=adata.var_names) if tf_names is None: neighbors = pd.melt(neighbors, ignore_index=False) neighbors.columns = ['target', 'importance'] neighbors = neighbors.reset_index() else: tf_idx = np.flatnonzero(np.isin(adata.var_names, tf_names)) if tf_idx.shape[0] == 0: raise ValueError("Could not find any tf in var_names") neighbors = pd.melt(neighbors.iloc[tf_idx, :], ignore_index=False) neighbors.columns = ['target', 'importance'] neighbors = neighbors.reset_index() neighbors.columns = ['TF', 'target', 'importance'] neighbors = neighbors[neighbors['importance'] > 0] neighbors = neighbors.sort_values(by='importance', ascending=False) neighbors = neighbors.reset_index(drop=True) regulons = modules_from_adjacencies( adjacencies=neighbors, ex_mtx=adata.to_df(), thresholds=thresholds, top_n_targets=top_n_targets, top_n_regulators=top_n_regulators, min_genes=min_genes, rho_dichotomize=False, ) for idx in range(len(regulons)): gene2weight = dict(regulons[idx].gene2weight) gene2weight[regulons[idx].transcription_factor] = regulons[idx].weights[1] regulons[idx] = regulons[idx].copy(gene2weight=gene2weight) if motif_annotations_fname is not None and db_fnames is not None: def name(fname): return os.path.splitext(os.path.basename(fname))[0] dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames] df = prune2df(dbs, regulons, motif_annotations_fname) regulons = df2regulons(df) key_added = 'regulons' adata.uns[key_added] = {} regulons_dict = adata.uns[key_added] regulons_dict['regulons'] = dump(regulons, default_flow_style=False, Dumper=Dumper) regulons_dict['params'] = {} regulons_dict['params']['tf_names'] = tf_names regulons_dict['params']['thresholds'] = str(thresholds) regulons_dict['params']['top_n_targets'] = str(top_n_targets) regulons_dict['params']['top_n_regulators'] = str(top_n_regulators) regulons_dict['params']['min_genes'] = min_genes return adata if copy else None
[docs]def aucell( adata: AnnData, auc_threshold: float = 0.05, noweights: bool = False, normalize: bool = False, random_state: int = 0, num_workers: int = 1, copy: bool = False, ) -> Optional[AnnData]: ''' Compute the regulon activity AUCell matrix [Aibar17]_. Parameters ---------- adata Annotated data matrix. auc_threshold The fraction of the ranked genome to take into account for the calculation of the Area Under the recovery Curve. noweights Should the weights of the genes part of a signature be used in calculation of enrichment? normalize Normalize the AUC values to a maximum of 1.0 per regulon. random_state Change to use different initial states for the optimization. num_workers The number of cores to use. copy Return a copy instead of writing to ``adata``. Returns ------- Depending on ``copy``, returns or updates ``adata`` with the following fields. .obsm['aucell'] : :class:`~pandas.DataFrame` The regulon activity AUCell matrix. ''' adata = adata.copy() if copy else adata key = 'regulons' if key not in adata.uns: raise ValueError(f'Did not find .uns["{key}"].') regulons = load(adata.uns[key]['regulons'], Loader=Loader) auc_mtx = pyscenic_aucell( exp_mtx=adata.to_df(), signatures=regulons, auc_threshold=auc_threshold, noweights=noweights, normalize=normalize, seed=random_state, num_workers=num_workers, ) key_added = 'aucell' adata.uns[key_added] = {} aucell_dict = adata.uns[key_added] aucell_dict['params'] = {} aucell_dict['params']['auc_threshold'] = auc_threshold aucell_dict['params']['noweights'] = noweights aucell_dict['params']['normalize'] = normalize aucell_dict['params']['random_state'] = random_state adata.obsm[key_added] = auc_mtx.loc[adata.obs_names,:] return adata if copy else None