diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..3a13d655600036352f11d68507d7526dcce506b6
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 scVAR
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index e3d39587d9720c592074f0d2549f9eee622e24c3..c0418582e3d838a1866fa858e27b7ee398ad2d63 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,6 @@
-# Scvar
+# scVAR
+
+scVAR is a tool for integrating scRNA-seq transcriptomic and genomic (variant) data.
+
+It provides functions for per-omic preprocessing, contrastive integration of the two omics, and comparison of the resulting clusters.
diff --git a/scVAR/__init__.py b/scVAR/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/scVAR/scVAR.py b/scVAR/scVAR.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ae15e34788db87a8f4e68c41952af88fd344920
--- /dev/null
+++ b/scVAR/scVAR.py
@@ -0,0 +1,322 @@
+# imports
+import numpy as np
+import pandas as pd
+import scanpy as sc
+import gc
+import torch
+import torch.nn.functional as F
+import umap
+import anndata
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics.cluster import adjusted_rand_score
+from sklearn.metrics import normalized_mutual_info_score
+from sklearn.feature_extraction.text import TfidfTransformer
+from scipy import io
+from scipy import sparse
+import matplotlib.pyplot as plt
+
+
+### FUNCTIONS ###
+#################
+
+def transcriptomicAnalysis(path_10x, bcode_variants, high_var_genes_min_mean=0.015, high_var_genes_max_mean=3, high_var_genes_min_disp=0.75,
+                           min_genes=200, min_cells=3, max_pct_mt=100, n_neighbors=20, n_pcs=30, transcript_key='trans', manual_matrix_reading=False):
+    """Load, filter, normalize and embed the transcriptomic (10x) data."""
+    # load the sparse expression matrix
+    if not manual_matrix_reading:
+        adata = sc.read_10x_mtx(path=path_10x, make_unique=True)
+    else:
+        ######## alternative manual loading (used for the PTXX and GLIO samples)
+        rna = path_10x + 'matrix.mtx.gz'
+        barcode = path_10x + 'barcodes.tsv.gz'
+        feat = path_10x + 'features.tsv.gz'
+        rna = io.mmread(rna)
+        B = rna.todense()
+        B = B.T  # cells as rows, genes as columns
+        barcode = pd.read_csv(barcode, sep='\t', header=None, names=['bcode'])
+        barcode.index = barcode.iloc[:, 0]
+        barcode = barcode.drop('bcode', axis=1)
+        feat = pd.read_csv(feat, sep='\t', header=None, names=['gene_ids', 'gene_symbol', 'feature_types'])
+        feat.index = feat['gene_symbol']
+        feat = feat.drop('gene_symbol', axis=1)
+        adata = anndata.AnnData(X=B, obs=barcode, var=feat)
+        adata.X = sparse.csr_matrix(adata.X)
+        ################
+    bcode_var = pd.read_csv(bcode_variants, sep='\t', header=None)[0]
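+    # keep only the barcodes that also have variant calls, so that both omics
+    # describe the same set of cells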
+    adata = adata[adata.obs.index.isin(bcode_var)]
+
+    # use Scanpy to preprocess the data
+    sc.pp.filter_cells(adata, min_genes=min_genes)
+    sc.pp.filter_genes(adata, min_cells=min_cells)
+    adata.var['mt'] = adata.var_names.str.startswith('MT-')
+    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
+    adata = adata[adata.obs.pct_counts_mt < max_pct_mt, :]
+
+    sc.pp.normalize_total(adata, target_sum=1e4)
+    sc.pp.log1p(adata)
+    sc.pp.highly_variable_genes(adata, min_mean=high_var_genes_min_mean, max_mean=high_var_genes_max_mean, min_disp=high_var_genes_min_disp)
+    sc.pp.pca(adata, n_comps=n_pcs, use_highly_variable=True)
+    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs, random_state=42)
+    sc.tl.umap(adata, random_state=42)
+    adata.obsm[transcript_key+'_umap'] = adata.obsm['X_umap']
+
+    # scale the data and store the matrix used later for integration
+    scaler = StandardScaler()
+    adata.uns[transcript_key+'_X'] = scaler.fit_transform(adata.X.toarray())
+
+    return adata
+
+
+def variantAnalysis(adata, matrix_path, bcode_path, variants_path, var_filter=0.2, n_neighbors=20, n_pcs=30, variant_key='variant'):
+    """Load the cell-by-variant matrix, filter it, and embed it with its own UMAP."""
+    # load the sparse variant matrix
+    var = io.mmread(matrix_path)
+    var = var.todense()
+    barcode = pd.read_csv(bcode_path, sep='\t', header=None)[0]
+    snv = pd.read_csv(variants_path, sep='\t', header=None)[0]
+    var = pd.DataFrame(var, snv, barcode)
+    var = var.T
+
+    # filter: first keep the cells retained in the transcriptomic analysis,
+    # then keep the variants detected in more than var_filter of the cells
+    var = var.loc[list(adata.obs.index), ]
+    col_sum = var.sum()
+    nrow = len(var)
+    selected_var = var.columns[col_sum/nrow > var_filter]
+    selected_var = selected_var.tolist()
+    var = var.loc[:, selected_var]
+
+    # scaling
+    scaler = StandardScaler()
+    var_X = scaler.fit_transform(var)
+
+    # variant-based UMAP
+    adata.uns[variant_key+'_orig_mat'] = var
+    adata.uns[variant_key+'_X'] = var_X
+    adata.obsm[variant_key+'_pca'] = sc.pp.pca(adata.uns[variant_key+'_X'])
+    sc.pp.neighbors(adata, use_rep=variant_key+'_pca', n_neighbors=n_neighbors, n_pcs=n_pcs, key_added=variant_key+'_neighbors', random_state=42)
+    umap_data = sc.tl.umap(adata, neighbors_key=variant_key+'_neighbors', copy=True, random_state=42)
+    adata.obsm[variant_key+'_umap'] = umap_data.obsm['X_umap']
+    return adata
+
+
+def calcOmicsClusters(adata, omic_key, res=0.5, n_neighbors=None, n_pcs=None):
+    """Leiden clustering on the UMAP embedding of a single omic."""
+    if n_neighbors is None:
+        n_neighbors = adata.uns['neighbors']['params']['n_neighbors']
+    if n_pcs is None:
+        n_pcs = adata.varm['PCs'].shape[1]
+    # the stored UMAP embeddings are 2-dimensional, so cap n_pcs at the width
+    # of the representation to avoid a ValueError in sc.pp.neighbors
+    n_pcs = min(n_pcs, adata.obsm[omic_key+'_umap'].shape[1])
+    neig_name = omic_key+'_'+str(n_neighbors)+'_neighbors_'+str(n_pcs)+'_PCs'
+    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs, use_rep=omic_key+'_umap', key_added=neig_name)
+    sc.tl.leiden(adata, resolution=res, key_added=omic_key+'_clust_'+str(res), neighbors_key=neig_name)
+    return adata
+
+
+def weightsInit(m):
+    # Xavier initialization for every linear layer
+    if isinstance(m, torch.nn.Linear):
+        torch.nn.init.xavier_uniform_(m.weight)
+        m.bias.data.zero_()
+
+
+def omicsIntegration(adata, transcript_key='trans', variant_key='variant', clf_out=25, integration_key='int'):
+    """Integrate the two omics with a paired contrastive model and store the joint UMAP."""
+    # note: clf_out should ideally match the number of cell types expected in the sample
+    rna_X = adata.uns[transcript_key+'_X']
+    var_X = adata.uns[variant_key+'_X']
+    net = pairedIntegration(input_dim_a=rna_X.shape[1], input_dim_b=var_X.shape[1], clf_out=clf_out)
+    net.apply(weightsInit)
+    # training process
+    pairedIntegrationTrainer(X_a=rna_X, X_b=var_X, model=net, num_epoch=50)
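+    # after training, encoder_a and encoder_b map the two omics of each cell
+    # into a shared 128-dimensional latent space, so their L2-normalized
+    # embeddings can be placed on a single UMAP below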
+    # integrated visualization
+    net.to(torch.device("cpu"))
+    X_all_tensor_a = torch.tensor(rna_X).float()
+    X_all_tensor_b = torch.tensor(var_X).float()
+
+    y_pred_a = net.encoder_a(X_all_tensor_a)
+    y_pred_a = F.normalize(y_pred_a, dim=1, p=2)
+    y_pred_a = y_pred_a.cpu().detach().numpy()
+
+    y_pred_b = net.encoder_b(X_all_tensor_b)
+    y_pred_b = F.normalize(y_pred_b, dim=1, p=2)
+    y_pred_b = y_pred_b.cpu().detach().numpy()
+
+    # joint UMAP of both omics
+    y_pred = np.concatenate((y_pred_a, y_pred_b), axis=0)
+    umaps = umap.UMAP(n_neighbors=5, min_dist=0.0, n_components=2, metric="euclidean", random_state=42).fit(y_pred)
+    embedding = umaps.transform(y_pred)
+    embedding = pd.DataFrame(embedding)
+    embedding.columns = ['UMAP1', 'UMAP2']
+    emb = embedding.iloc[:, :2].values
+
+    # the first half of the rows are the RNA embeddings, the second half the
+    # variant embeddings of the same cells, in the same order
+    half = int(emb.shape[0] / 2)
+    rna_coord = emb[:half]
+    variants_coord = emb[half:]
+
+    # the integrated coordinate of each cell is the mean of its two omic coordinates
+    new_umap = (rna_coord + variants_coord) / 2
+    adata.obsm[integration_key+'_umap'] = new_umap
+
+    return adata
+
+
+class pairedIntegration(torch.nn.Module):
+    """Paired contrastive model: one encoder per omic plus shared cluster and feature heads."""
+    def __init__(self, input_dim_a=2000, input_dim_b=2000, clf_out=10):
+        super(pairedIntegration, self).__init__()
+        self.input_dim_a = input_dim_a
+        self.input_dim_b = input_dim_b
+        self.clf_out = clf_out
+        self.encoder_a = torch.nn.Sequential(
+            torch.nn.Linear(self.input_dim_a, 1000),
+            torch.nn.BatchNorm1d(1000),
+            torch.nn.ReLU(),
+            torch.nn.Linear(1000, 512),
+            torch.nn.BatchNorm1d(512),
+            torch.nn.ReLU(),
+            torch.nn.Linear(512, 128),
+            torch.nn.BatchNorm1d(128),
+            torch.nn.ReLU())
+        self.encoder_b = torch.nn.Sequential(
+            torch.nn.Linear(self.input_dim_b, 1000),
+            torch.nn.BatchNorm1d(1000),
+            torch.nn.ReLU(),
+            torch.nn.Linear(1000, 512),
+            torch.nn.BatchNorm1d(512),
+            torch.nn.ReLU(),
+            torch.nn.Linear(512, 128),
+            torch.nn.BatchNorm1d(128),
+            torch.nn.ReLU())
+        self.clf = torch.nn.Sequential(
+            torch.nn.Linear(128, self.clf_out),
+            torch.nn.Softmax(dim=1))
+        self.feature = torch.nn.Sequential(
+            torch.nn.Linear(128, 32))
+
+    def forward(self, x_a, x_b):
+        out_a = self.encoder_a(x_a)
+        f_a = self.feature(out_a)
+        y_a = self.clf(out_a)
+
+        out_b = self.encoder_b(x_b)
+        f_b = self.feature(out_b)
+        y_b = self.clf(out_b)
+        return f_a, y_a, f_b, y_b
+
+
+def pairedIntegrationTrainer(X_a, X_b, model, batch_size=512, num_epoch=5,
+                             f_temp=0.1, p_temp=1.0):
+    """Train the paired model with feature-level and cluster-level contrastive losses."""
+    device = torch.device("cpu")
+    f_con = contrastiveLoss(batch_size=batch_size, temperature=f_temp)
+    p_con = contrastiveLoss(batch_size=model.clf_out, temperature=p_temp)
+    opt = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
+
+    for k in range(num_epoch):
+        model.to(device)
+        # shuffle both omics with the same permutation so the pairs stay aligned
+        n = X_a.shape[0]
+        r = np.random.permutation(n)
+        X_train_a = X_a[r, :]
+        X_tensor_A = torch.tensor(X_train_a).float()
+        X_train_b = X_b[r, :]
+        X_tensor_B = torch.tensor(X_train_b).float()
+
+        losses = 0
+
+        for j in range(n//batch_size):
+            # two noisy views of omic A and one noisy view of omic B
+            inputs_a = X_tensor_A[j*batch_size:(j+1)*batch_size, :].to(device)
+            inputs_a2 = inputs_a + torch.normal(0, 1, inputs_a.shape).to(device)
+            inputs_a = inputs_a + torch.normal(0, 1, inputs_a.shape).to(device)
+
+            inputs_b = X_tensor_B[j*batch_size:(j+1)*batch_size, :].to(device)
+            inputs_b = inputs_b + torch.normal(0, 1, inputs_b.shape).to(device)
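+
+            # two forward passes build a SimCLR-style set of positive pairs:
+            # the RNA view vs. the variant view of the same cells (cross-omic),
+            # and the two noisy RNA views (cross-augmentation); both are
+            # contrasted at the feature level (f_con) and, via the transposed
+            # soft assignments, at the cluster level (p_con)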
+            feas, o, nfeas, no = model(inputs_a, inputs_b)
+            feas2, o2, _, _ = model(inputs_a2, inputs_b)
+
+            fea_mi = f_con(feas, nfeas) + f_con(feas, feas2)
+            p_mi = p_con(o.T, no.T) + p_con(o.T, o2.T)
+
+            loss = fea_mi + p_mi
+
+            opt.zero_grad()
+            loss.backward()
+            opt.step()
+
+            losses += loss.item()
+        print("Total loss: " + str(round(losses, 4)))
+        gc.collect()
+
+
+class contrastiveLoss(torch.nn.Module):
+    """NT-Xent (normalized temperature-scaled cross-entropy) contrastive loss."""
+    def __init__(self, batch_size, temperature=0.5):
+        super().__init__()
+        self.batch_size = batch_size
+        self.register_buffer("temperature", torch.tensor(temperature))
+        self.register_buffer("negatives_mask", (~torch.eye(batch_size * 2, batch_size * 2,
+                                                           dtype=bool)).float())
+
+    def forward(self, emb_i, emb_j):
+        """emb_i and emb_j are batches of embeddings, where corresponding
+        indices are positive pairs z_i, z_j as in the SimCLR paper."""
+        z_i = F.normalize(emb_i, dim=1, p=2)
+        z_j = F.normalize(emb_j, dim=1, p=2)
+
+        representations = torch.cat([z_i, z_j], dim=0)
+        similarity_matrix = F.cosine_similarity(representations.unsqueeze(1), representations.unsqueeze(0), dim=2)
+
+        sim_ij = torch.diag(similarity_matrix, self.batch_size)
+        sim_ji = torch.diag(similarity_matrix, -self.batch_size)
+        positives = torch.cat([sim_ij, sim_ji], dim=0)
+
+        numerator = torch.exp(positives / self.temperature)
+        denominator = self.negatives_mask * torch.exp(similarity_matrix / self.temperature)
+
+        loss_partial = -torch.log(numerator / torch.sum(denominator, dim=1))
+        loss = torch.sum(loss_partial) / (2 * self.batch_size)
+        return loss
+
+
+def distributionClusters(adata, control_cl, group_cl, perc_cell_to_show=0.1, figsize=(12, 8), dpi=100, save_path=None):
+    """Stacked bar plot showing how the clusters in control_cl distribute across the clusters in group_cl."""
+    df = adata.obs.groupby([group_cl, control_cl]).size().unstack()
+    # drop group_cl clusters whose share of all cells is below perc_cell_to_show
+    df = df.loc[df.sum(axis=1) / df.values.sum() >= perc_cell_to_show, :]
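+    # df_rel keeps the absolute counts for the bar labels; df itself is
+    # normalized row-wise to proportions just below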
+    df_rel = df
+    df = df.div(df.sum(axis=1), axis=0)
+    df[group_cl] = df.index
+
+    plt.rcParams["figure.figsize"] = figsize
+    plt.rcParams['figure.dpi'] = dpi
+    plt.rcParams['figure.facecolor'] = '#FFFFFF'
+    df.plot(
+        x=group_cl,
+        kind='barh',
+        stacked=True,
+        mark_right=True,
+    )
+    leg = plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
+    leg.get_frame().set_edgecolor('black')
+    plt.xlabel('perc_'+control_cl, loc='center')
+    # annotate each bar segment with its absolute cell count, skipping tiny segments
+    for n in df_rel:
+        for i, (pos_x, ab, value) in enumerate(zip(df.iloc[:, :-1].cumsum(1)[n], df[n], df_rel[n])):
+            if (value == 0) | (ab <= 0.05):
+                value = ''
+            plt.text(pos_x-ab/2, i, str(value), va='center', ha='center')
+    plt.grid(False)
+
+    if save_path is not None:
+        plt.savefig(save_path, bbox_inches='tight')
+    plt.show()
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..191dae60d27be837543d8d1ea681ecb990fd2da2
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,33 @@
+import setuptools
+
+with open("README.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
+
+setuptools.setup(
+    name="scVAR",
+    version="0.0.1",
+    author="Samuele Manessi",
+    author_email="samuele.manessi@itb.cnr.it",
+    description="A tool to integrate genomics and transcriptomics in scRNA-seq data.",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    #url="https://github.com/ILTUOACCOUNT/ILTUOPACCHETTO",
+    packages=setuptools.find_packages(include=['scVAR', 'scVAR.*']),
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+    install_requires=[
+        'numpy',
+        'pandas',
+        'scanpy',
+        # gc is part of the Python standard library, so it is not listed here
+        'torch',
+        'umap-learn',    # provides the `umap` import
+        'anndata',
+        'scikit-learn',  # provides the `sklearn` imports
+        'scipy',
+        'matplotlib',
+    ],
+    python_requires='>=3.7',
+)
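diff --git a/example.py b/example.py
new file mode 100644
--- /dev/null
+++ b/example.py
@@ -0,0 +1,28 @@
+"""Minimal end-to-end usage sketch for scVAR.
+
+This script only illustrates the intended call order of the scVAR functions;
+every path, file name and parameter value below is a hypothetical placeholder
+to be replaced with your own 10x and variant-calling outputs.
+"""
+from scVAR.scVAR import (transcriptomicAnalysis, variantAnalysis,
+                         calcOmicsClusters, omicsIntegration,
+                         distributionClusters)
+
+# 1. per-omic analysis: expression first, then variants on the same barcodes
+adata = transcriptomicAnalysis(path_10x='data/filtered_feature_bc_matrix/',
+                               bcode_variants='data/variant_barcodes.tsv')
+adata = variantAnalysis(adata,
+                        matrix_path='data/variants/matrix.mtx',
+                        bcode_path='data/variants/barcodes.tsv',
+                        variants_path='data/variants/variants.tsv')
+
+# 2. cluster each omic separately (Leiden on the per-omic UMAPs)
+adata = calcOmicsClusters(adata, omic_key='trans', res=0.5)
+adata = calcOmicsClusters(adata, omic_key='variant', res=0.5)
+
+# 3. contrastive integration of the two omics, then clustering of the joint UMAP
+adata = omicsIntegration(adata, clf_out=25)
+adata = calcOmicsClusters(adata, omic_key='int', res=0.5)
+
+# 4. compare how the transcriptomic clusters distribute across the integrated ones
+distributionClusters(adata, control_cl='trans_clust_0.5', group_cl='int_clust_0.5')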