Source code for astir.data.data_readers

import warnings
import matplotlib.cbook

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=matplotlib.cbook.mplDeprecation)

import yaml
import os
import loompy
import anndata

import numpy as np
import pandas as pd

import FlowCytometryTools
from FlowCytometryTools import FCMeasurement
from sklearn.preprocessing import OneHotEncoder
import torch


[docs]def from_csv_yaml( csv_input: str, marker_yaml: str, design_csv=None, random_seed=1234, dtype=torch.float64, ): """Create an Astir object from an expression CSV and marker YAML :param csv_input: Path to input csv containing expression for cells (rows) by proteins (columns). First column is cell identifier, and additional column names are gene identifiers. :param marker_yaml: Path to input YAML file containing marker gene information. Should include cell_type and cell_state entries. See documention. :param design_csv: Path to design matrix as a CSV. Rows should be cells, and columns covariates. First column is cell identifier, and additional column names are covariate identifiers. """ df_gex = pd.read_csv(csv_input, index_col=0) design = None if design_csv is not None: design = pd.read_csv(design_csv, index_col=0) with open(marker_yaml, "r") as stream: marker_dict = yaml.safe_load(stream) from astir.astir import Astir return Astir(df_gex, marker_dict, design, random_seed, dtype=dtype)
[docs]def from_csv_dir_yaml( input_dir: str, marker_yaml: str, random_seed=1234, dtype=torch.float64, ): """Create an Astir object a directory containing multiple csv files :param input_dir: Path to a directory containing multiple CSV files, each in the format expected by `from_csv_yaml` :param marker_yaml: Path to input YAML file containing marker gene information. Should include cell_type and cell_state entries. See documention. :param design_csv: Path to design matrix as a CSV. Rows should be cells, and columns covariates. First column is cell identifier, and additional column names are covariate identifiers. """ # TODO: add text explaining concatenation # Parse the input directory csv_files = [ os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith("csv") ] # Read to gene expression df and parse dfs = [pd.read_csv(f, index_col=0) for f in csv_files] df_gex = pd.concat(dfs, axis=0) # Construct a sample specific design matrix design_list = [np.repeat(str(i), dfs[i].shape[0]) for i in range(len(dfs))] design = ( OneHotEncoder() .fit_transform(np.concatenate(design_list, axis=0).reshape(-1, 1)) .todense() ) design = design[:, :-1] # remove final column design = np.concatenate( [np.ones((design.shape[0], 1)), design], axis=1 ) # add in intercept! with open(marker_yaml, "r") as stream: marker_dict = yaml.safe_load(stream) from astir.astir import Astir return Astir(df_gex, marker_dict, design, random_seed, dtype)
[docs]def from_loompy_yaml( loom_file: str, marker_yaml: str, protein_name_attr: str = "protein", cell_name_attr: str = "cell_name", batch_name_attr: str = "batch", random_seed: int = 1234, dtype=torch.float64, ): """Create an Astir object from a loom file and a marker yaml :param loom_file: Path to a loom file, where rows correspond to proteins and columns to cells :param marker_yaml: Path to input YAML file containing marker gene information. Should include cell_type and cell_state entries. See documention. :param protein_name_attr: The attribute (key) in the row attributes that identifies the protein names (required to match with the marker gene information) :param cell_name_attr: The attribute (key) in the column attributes that identifies the name of each cell :param batch_name_attr: The attribute (key) in the column attributes that identifies the batch. A design matrix will be built using this (if present) using a one-hot encoding to control for batch. :param random_seed: The random seed to be used to initialize variables :returns: An object of class `astir_bash.py.Astir` using data imported from the loom files """ # TODO: This function is memory inefficient and goes against the philosophy of loom files. Should be improved batch_list = None with loompy.connect(loom_file) as ds: df_gex = pd.DataFrame(ds[:, :].T) df_gex.columns = ds.ra[protein_name_attr] if cell_name_attr in ds.ca.keys(): df_gex.index = ds.ca[cell_name_attr] if batch_name_attr in ds.ca.keys(): batch_list = ds.ca[batch_name_attr] design = None if batch_list is not None: design = OneHotEncoder().fit_transform(batch_list.reshape(-1, 1)).todense() design = design[:, :-1] # remove final column design = np.concatenate([np.ones((design.shape[0], 1)), design], axis=1) with open(marker_yaml, "r") as stream: marker_dict = yaml.safe_load(stream) from astir.astir import Astir return Astir(df_gex, marker_dict, design, random_seed, dtype)
[docs]def from_anndata_yaml( anndata_file: str, marker_yaml: str, protein_name: str = None, cell_name: str = None, batch_name: str = "batch", random_seed: int = 1234, dtype=torch.float64, ): """Create an Astir object from an :class:`anndata.Anndata` file and a marker yaml :param anndata_file: Path to an :class:`anndata.Anndata` `h5py` file :param marker_yaml: Path to input YAML file containing marker gene information. Should include cell_type and cell_state entries. See documention. :param protein_name: The column of `adata.var` containing protein names. If this is none, defaults to `adata.var_names` :param cell_name: The column of `adata.obs` containing cell names. If this is none, defaults to `adata.obs_names` :param batch_name: The column of `adata.obs` containing batch names. A design matrix will be built using this (if present) using a one-hot encoding to control for batch. :param random_seed: The random seed to be used to initialize variables :returns: An object of class `astir_bash.py.Astir` using data imported from the loom files """ # TODO: This function is memory inefficient and goes against the philosophy of loom files. Should be improved batch_list = None ad = anndata.read_h5ad(anndata_file) df_gex = pd.DataFrame(ad.X.toarray()) if protein_name is not None: df_gex.columns = ad.var[protein_name] else: df_gex.columns = ad.var_names if cell_name is not None: df_gex.index = ad.obs[cell_name] else: df_gex.index = ad.obs_names if batch_name is not None: batch_list = ad.obs[batch_name] design = None if batch_list is not None: design = ( OneHotEncoder() .fit_transform(batch_list.to_numpy().reshape(-1, 1)) .todense() ) design = design[:, :-1] # remove final column design = np.concatenate([np.ones((design.shape[0], 1)), design], axis=1) with open(marker_yaml, "r") as stream: marker_dict = yaml.safe_load(stream) from astir.astir import Astir return Astir(df_gex, marker_dict, design, random_seed, dtype)
def from_fcs_yaml( fcs_file: str, marker_yaml: str, random_seed: int = 1234, dtype=torch.float64, ): expr_fcs = FCMeasurement(ID="astir_data", datafile=fcs_file) expr_df = expr_fcs.data with open(marker_yaml, "r") as stream: marker_dict = yaml.safe_load(stream) from astir.astir import Astir return Astir(expr_df, marker_dict, random_seed, dtype)