import warnings
import matplotlib.cbook
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=matplotlib.cbook.mplDeprecation)
import yaml
import os
import loompy
import anndata
import numpy as np
import pandas as pd
import FlowCytometryTools
from FlowCytometryTools import FCMeasurement
from sklearn.preprocessing import OneHotEncoder
import torch
def from_csv_yaml(
    csv_input: str,
    marker_yaml: str,
    design_csv=None,
    random_seed=1234,
    dtype=torch.float64,
):
    """Build an Astir object from an expression CSV and a marker YAML file.

    :param csv_input: Path to the input CSV holding expression for cells (rows)
        by proteins (columns). The first column is read as the cell identifier
        (it becomes the index); the remaining column names are gene identifiers.
    :param marker_yaml: Path to the YAML file with marker gene information.
        Should include cell_type and cell_state entries. See documentation.
    :param design_csv: Optional path to a design matrix CSV. Rows should be
        cells and columns covariates; the first column is read as the cell
        identifier. When ``None``, no design matrix is passed on.
    :param random_seed: Random seed forwarded to ``Astir``.
    :param dtype: Torch dtype forwarded to ``Astir``.
    :returns: The ``Astir`` object constructed from the loaded data.
    """
    expression = pd.read_csv(csv_input, index_col=0)

    # The design matrix is optional; ``None`` is handed to Astir when absent.
    covariates = (
        pd.read_csv(design_csv, index_col=0) if design_csv is not None else None
    )

    with open(marker_yaml, "r") as yaml_handle:
        markers = yaml.safe_load(yaml_handle)

    # Imported at call time rather than at module level
    # (presumably to avoid a circular import -- confirm).
    from astir.astir import Astir

    return Astir(expression, markers, covariates, random_seed, dtype=dtype)
def from_csv_dir_yaml(
    input_dir: str, marker_yaml: str, random_seed=1234, dtype=torch.float64,
):
    """Create an Astir object from a directory containing multiple CSV files.

    Every file in ``input_dir`` whose name ends in ``.csv`` is read (first
    column as index) and the resulting tables are concatenated row-wise, in
    sorted filename order. A sample-specific design matrix is built alongside:
    an intercept column of ones followed by one-hot indicator columns for the
    files, with the final indicator column dropped.

    :param input_dir: Path to a directory containing multiple CSV files, each
        in the format expected by `from_csv_yaml`.
    :param marker_yaml: Path to the YAML file with marker gene information.
        Should include cell_type and cell_state entries. See documentation.
    :param random_seed: Random seed forwarded to ``Astir``.
    :param dtype: Torch dtype forwarded to ``Astir``.
    :returns: The ``Astir`` object constructed from the concatenated data.
    :raises ValueError: If ``input_dir`` contains no ``.csv`` files.
    """
    # Sort the listing: os.listdir returns entries in arbitrary order, which
    # would make row order and the design matrix differ between runs.
    # Match on ".csv" (with the dot) so names merely ending in "csv" are skipped.
    csv_files = sorted(
        os.path.join(input_dir, name)
        for name in os.listdir(input_dir)
        if name.endswith(".csv")
    )
    if not csv_files:
        raise ValueError("No CSV files found in directory: {}".format(input_dir))

    # Read each file into an expression frame and stack them row-wise.
    dfs = [pd.read_csv(path, index_col=0) for path in csv_files]
    df_gex = pd.concat(dfs, axis=0)

    # Label every row with the (stringified) position of its source file.
    design_list = [np.repeat(str(i), df.shape[0]) for i, df in enumerate(dfs)]
    design = (
        OneHotEncoder()
        .fit_transform(np.concatenate(design_list, axis=0).reshape(-1, 1))
        .todense()
    )
    # Drop the final indicator column so the intercept added below is not
    # collinear with the full set of indicators.
    design = design[:, :-1]
    design = np.concatenate([np.ones((design.shape[0], 1)), design], axis=1)

    with open(marker_yaml, "r") as stream:
        marker_dict = yaml.safe_load(stream)

    # Imported at call time rather than at module level
    # (presumably to avoid a circular import -- confirm).
    from astir.astir import Astir

    return Astir(df_gex, marker_dict, design, random_seed, dtype)
def from_loompy_yaml(
    loom_file: str,
    marker_yaml: str,
    protein_name_attr: str = "protein",
    cell_name_attr: str = "cell_name",
    batch_name_attr: str = "batch",
    random_seed: int = 1234,
    dtype=torch.float64,
):
    """Build an Astir object from a loom file and a marker YAML file.

    :param loom_file: Path to a loom file, where rows correspond to proteins
        and columns to cells.
    :param marker_yaml: Path to the YAML file with marker gene information.
        Should include cell_type and cell_state entries. See documentation.
    :param protein_name_attr: Key in the row attributes that holds the protein
        names (required to match with the marker gene information).
    :param cell_name_attr: Key in the column attributes that holds the name of
        each cell. Used as the index only if the key is present.
    :param batch_name_attr: Key in the column attributes that holds the batch.
        If present, a design matrix (intercept plus one-hot batch indicators
        with the final indicator dropped) is built from it.
    :param random_seed: Random seed forwarded to ``Astir``.
    :param dtype: Torch dtype forwarded to ``Astir``.
    :returns: The ``Astir`` object constructed from the loom data.
    """
    # TODO: This function is memory inefficient and goes against the
    # philosophy of loom files. Should be improved
    batch_labels = None
    with loompy.connect(loom_file) as loom:
        # The whole matrix is loaded and transposed to cells x proteins.
        expression = pd.DataFrame(loom[:, :].T)
        expression.columns = loom.ra[protein_name_attr]

        column_attrs = loom.ca.keys()
        if cell_name_attr in column_attrs:
            expression.index = loom.ca[cell_name_attr]
        if batch_name_attr in column_attrs:
            batch_labels = loom.ca[batch_name_attr]

    if batch_labels is None:
        design = None
    else:
        one_hot = (
            OneHotEncoder().fit_transform(batch_labels.reshape(-1, 1)).todense()
        )
        # Drop the final indicator column, then prepend the intercept.
        one_hot = one_hot[:, :-1]
        intercept = np.ones((one_hot.shape[0], 1))
        design = np.concatenate([intercept, one_hot], axis=1)

    with open(marker_yaml, "r") as yaml_handle:
        markers = yaml.safe_load(yaml_handle)

    # Imported at call time rather than at module level
    # (presumably to avoid a circular import -- confirm).
    from astir.astir import Astir

    return Astir(expression, markers, design, random_seed, dtype)
def from_anndata_yaml(
    anndata_file: str,
    marker_yaml: str,
    protein_name: str = None,
    cell_name: str = None,
    batch_name: str = "batch",
    random_seed: int = 1234,
    dtype=torch.float64,
):
    """Create an Astir object from an :class:`anndata.AnnData` file and a marker YAML.

    :param anndata_file: Path to an :class:`anndata.AnnData` ``h5ad`` file.
    :param marker_yaml: Path to the YAML file with marker gene information.
        Should include cell_type and cell_state entries. See documentation.
    :param protein_name: The column of `adata.var` containing protein names.
        If this is None, defaults to `adata.var_names`.
    :param cell_name: The column of `adata.obs` containing cell names. If this
        is None, defaults to `adata.obs_names`.
    :param batch_name: The column of `adata.obs` containing batch names. If it
        is not None and present in `adata.obs`, a design matrix (intercept plus
        one-hot batch indicators with the final indicator dropped) is built
        from it; otherwise no design matrix is passed on.
    :param random_seed: Random seed forwarded to ``Astir``.
    :param dtype: Torch dtype forwarded to ``Astir``.
    :returns: The ``Astir`` object constructed from the AnnData file.
    """
    # TODO: This function is memory inefficient: the full expression matrix
    # is materialised as a dense DataFrame. Should be improved
    ad = anndata.read_h5ad(anndata_file)

    # ``X`` may be a sparse matrix or a plain dense array; only densify via
    # ``toarray`` when the object actually provides it.
    exprs = ad.X
    exprs = exprs.toarray() if hasattr(exprs, "toarray") else np.asarray(exprs)
    df_gex = pd.DataFrame(exprs)

    if protein_name is not None:
        df_gex.columns = ad.var[protein_name]
    else:
        df_gex.columns = ad.var_names

    if cell_name is not None:
        df_gex.index = ad.obs[cell_name]
    else:
        df_gex.index = ad.obs_names

    # Only use the batch column when it exists, so the default
    # ``batch_name="batch"`` does not fail on files without batch annotation.
    batch_list = None
    if batch_name is not None and batch_name in ad.obs.columns:
        batch_list = ad.obs[batch_name]

    design = None
    if batch_list is not None:
        design = (
            OneHotEncoder()
            .fit_transform(batch_list.to_numpy().reshape(-1, 1))
            .todense()
        )
        # Drop the final indicator column, then prepend the intercept.
        design = design[:, :-1]
        design = np.concatenate([np.ones((design.shape[0], 1)), design], axis=1)

    with open(marker_yaml, "r") as stream:
        marker_dict = yaml.safe_load(stream)

    # Imported at call time rather than at module level
    # (presumably to avoid a circular import -- confirm).
    from astir.astir import Astir

    return Astir(df_gex, marker_dict, design, random_seed, dtype)
def from_fcs_yaml(
    fcs_file: str, marker_yaml: str, random_seed: int = 1234, dtype=torch.float64,
):
    """Create an Astir object from an FCS file and a marker YAML.

    :param fcs_file: Path to the FCS file, loaded with
        ``FlowCytometryTools.FCMeasurement``; its ``data`` attribute is used
        as the expression table.
    :param marker_yaml: Path to the YAML file with marker gene information.
        Should include cell_type and cell_state entries. See documentation.
    :param random_seed: Random seed forwarded to ``Astir``.
    :param dtype: Torch dtype forwarded to ``Astir``.
    :returns: The ``Astir`` object constructed from the FCS data.
    """
    expr_fcs = FCMeasurement(ID="astir_data", datafile=fcs_file)
    expr_df = expr_fcs.data

    with open(marker_yaml, "r") as stream:
        marker_dict = yaml.safe_load(stream)

    # Imported at call time rather than at module level
    # (presumably to avoid a circular import -- confirm).
    from astir.astir import Astir

    # No design matrix is available for FCS input. Pass None explicitly in the
    # design position so random_seed and dtype reach their own parameters,
    # matching the (data, markers, design, random_seed, dtype) order used by
    # the other loaders in this file.
    return Astir(expr_df, marker_dict, None, random_seed, dtype)