Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
custom
scVAR
Commits
c4cc0487
Commit
c4cc0487
authored
Apr 01, 2025
by
Ivan Merelli
Browse files
uploaded software
parent
7bcc2d92
Changes
5
Hide whitespace changes
Inline
Side-by-side
LICENSE
0 → 100644
View file @
c4cc0487
MIT License
Copyright (c) 2022 scVAR
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
View file @
c4cc0487
# scVAR
# EXAMPLE
This is scVAR,
a software package for integrating scRNA-seq and genomic data.
# scVAR
scVAR/__init__.py
0 → 100644
View file @
c4cc0487
scVAR/scVAR.py
0 → 100644
View file @
c4cc0487
# import packs
import
numpy
as
np
import
pandas
as
pd
import
scanpy
as
sc
import
gc
import
torch
import
torch.nn.functional
as
F
import
umap
import
anndata
from
sklearn.preprocessing
import
StandardScaler
from
sklearn.metrics.cluster
import
adjusted_rand_score
from
sklearn.metrics
import
normalized_mutual_info_score
from
sklearn.feature_extraction.text
import
TfidfTransformer
from
scipy
import
io
from
scipy
import
sparse
import
matplotlib.pyplot
as
plt
### FUNCTION ###
################
def transcriptomicAnalysis(path_10x, bcode_variants, high_var_genes_min_mean=0.015, high_var_genes_max_mean=3, high_var_genes_min_disp=0.75, min_genes=200, min_cells=3, max_pct_mt=100, n_neighbors=20, n_pcs=30, transcript_key='trans', manual_matrix_reading=False):
    """Load a 10x expression matrix and run the standard Scanpy preprocessing
    pipeline: QC filtering, normalisation, HVG selection, PCA, neighbors, UMAP.

    Parameters
    ----------
    path_10x : str
        Directory with the 10x matrix files (matrix.mtx.gz, barcodes.tsv.gz,
        features.tsv.gz).
    bcode_variants : str
        Path to a headerless, tab-separated file whose first column lists the
        barcodes that also have variant calls; all other cells are dropped.
    high_var_genes_min_mean, high_var_genes_max_mean, high_var_genes_min_disp :
        Thresholds forwarded to ``sc.pp.highly_variable_genes``.
    min_genes, min_cells :
        QC thresholds for ``sc.pp.filter_cells`` / ``sc.pp.filter_genes``.
    max_pct_mt : float
        Keep only cells whose mitochondrial-count percentage is strictly
        below this value (default 100, i.e. effectively no MT filtering).
    n_neighbors, n_pcs : int
        k-NN graph size and number of principal components.
    transcript_key : str
        Prefix under which results are stored: ``<key>_umap`` in ``.obsm``
        and the scaled matrix ``<key>_X`` in ``.uns``.
    manual_matrix_reading : bool
        If True, read the three 10x files by hand instead of
        ``sc.read_10x_mtx``.

    Returns
    -------
    AnnData
        The filtered, preprocessed expression object.
    """
    # load sparse matrix
    if not manual_matrix_reading:
        adata = sc.read_10x_mtx(path=path_10x, make_unique=True)
    else:
        ######## alternative manual-reading path (originally for the PTXX and GLIO datasets)
        # load sparse matrix
        # NOTE(review): plain string concatenation — path_10x presumably must
        # end with a path separator for these file paths to resolve; confirm.
        rna = path_10x + 'matrix.mtx.gz'
        barcode = path_10x + 'barcodes.tsv.gz'
        feat = path_10x + 'features.tsv.gz'
        rna = io.mmread(rna)
        # Transpose so rows are barcodes/cells and columns are features
        # (the on-disk matrix stores features as rows, as implied by the
        # obs/var assignment below).
        B = rna.todense()
        B = B.T
        barcode = pd.read_csv(barcode, sep='\t', header=None, names=['bcode'])
        barcode.index = barcode.iloc[:, 0]
        barcode = barcode.drop('bcode', axis=1)
        feat = pd.read_csv(feat, sep='\t', header=None, names=['gene_ids', 'gene_symbol', 'feature_types'])
        feat.index = feat['gene_symbol']
        feat = feat.drop('gene_symbol', axis=1)
        adata = anndata.AnnData(X=B, obs=barcode, var=feat)
        # store X as CSR sparse, matching what sc.read_10x_mtx would produce
        adata.X = sparse.csr_matrix(adata.X)
    ################
    # keep only the cells that also appear in the variant barcode list
    bcode_var = pd.read_csv(bcode_variants, sep='\t', header=None)[0]
    adata = adata[adata.obs.index.isin(bcode_var)]
    # use Scanpy to preprocess data
    sc.pp.filter_cells(adata, min_genes=min_genes)
    sc.pp.filter_genes(adata, min_cells=min_cells)
    # flag mitochondrial genes by the human-style 'MT-' symbol prefix
    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    adata = adata[adata.obs.pct_counts_mt < max_pct_mt, :]
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, min_mean=high_var_genes_min_mean, max_mean=high_var_genes_max_mean, min_disp=high_var_genes_min_disp)
    sc.pp.pca(adata, n_comps=n_pcs, use_highly_variable=True)
    # fixed random_state so the embedding is reproducible across runs
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs, random_state=42)
    sc.tl.umap(adata, random_state=42)
    # keep a prefixed copy of the UMAP so other omics can store their own later
    adata.obsm[transcript_key + '_umap'] = adata.obsm['X_umap']
    # Scale data: dense z-scored expression matrix, consumed by omicsIntegration
    scaler = StandardScaler()
    adata.uns[transcript_key + '_X'] = scaler.fit_transform(adata.X.toarray())
    return(adata)
def variantAnalysis(adata, matrix_path, bcode_path, variants_path, var_filter=0.2, n_neighbors=20, n_pcs=30, variant_key='variant'):
    """Load a cell-by-variant matrix, filter it to the cells kept by the
    transcriptomic analysis, and compute a variant-based PCA/UMAP embedding.

    Parameters
    ----------
    adata : AnnData
        Output of ``transcriptomicAnalysis``; only its ``obs.index`` (the
        surviving barcodes) is used for filtering, and results are written
        back into it.
    matrix_path : str
        MatrixMarket file with the variant matrix (variants as rows, cells
        as columns — it is transposed below).
    bcode_path, variants_path : str
        Headerless, tab-separated files listing the column (barcode) and row
        (variant) labels of the matrix.
    var_filter : float
        Keep a variant only if its column sum divided by the number of cells
        exceeds this threshold.
    n_neighbors, n_pcs : int
        k-NN graph size and number of PCs for the variant embedding.
    variant_key : str
        Prefix for the stored results (``<key>_orig_mat`` / ``<key>_X`` in
        ``.uns``, ``<key>_pca`` / ``<key>_umap`` in ``.obsm``).

    Returns
    -------
    AnnData
        Same object with the variant-space results attached.
    """
    # load sparse matrix
    var = matrix_path
    barcode = bcode_path
    snv = variants_path
    var = io.mmread(var)
    var = var.todense()
    barcode = pd.read_csv(barcode, sep='\t', header=None)[0]
    snv = pd.read_csv(snv, sep='\t', header=None)[0]
    # positional args: data, index=snv (variants), columns=barcode (cells)
    var = pd.DataFrame(var, snv, barcode)
    # transpose to cells x variants so rows align with adata.obs
    var = var.T
    # filter col
    # row-select the cells still present in the transcriptomic AnnData
    # (raises KeyError if a barcode is missing from the variant matrix)
    var = var.loc[list(adata.obs.index), ]
    col_sum = var.sum()
    nrow = len(var)
    # keep variants detected in more than var_filter fraction of cells
    # (assumes entries are presence/absence counts — TODO confirm encoding)
    selected_var = var.columns[col_sum / nrow > var_filter]
    selected_var = selected_var.tolist()
    var = var.loc[:, selected_var]
    # filter first by the cells kept in the transcriptomic analysis,
    # then by the transcribed variants
    # Scaling
    scaler = StandardScaler()
    var_X = scaler.fit_transform(var)
    # find umap variant_based
    adata.uns[variant_key + '_orig_mat'] = var
    adata.uns[variant_key + '_X'] = var_X
    # sc.pp.pca on a bare array returns the PC coordinates directly
    adata.obsm[variant_key + '_pca'] = sc.pp.pca(adata.uns[variant_key + '_X'])
    sc.pp.neighbors(adata, use_rep=variant_key + '_pca', n_neighbors=n_neighbors, n_pcs=n_pcs, key_added=variant_key + '_neighbors', random_state=42)
    # copy=True keeps the default 'X_umap' slot of adata untouched
    umap_data = sc.tl.umap(adata, neighbors_key=variant_key + '_neighbors', copy=True, random_state=42)
    adata.obsm[variant_key + '_umap'] = umap_data.obsm['X_umap']
    return(adata)
def calcOmicsClusters(adata, omic_key, res=0.5, n_neighbors=None, n_pcs=None):
    """Cluster cells on one omic's UMAP embedding with the Leiden algorithm.

    Builds a fresh k-NN graph on ``adata.obsm[f"{omic_key}_umap"]`` and runs
    Leiden clustering on it, storing labels in ``f"{omic_key}_clust_{res}"``.

    Parameters
    ----------
    adata : AnnData
        Object holding the precomputed ``<omic_key>_umap`` representation.
    omic_key : str
        Prefix of the embedding to cluster ('trans', 'variant', 'int', ...).
    res : float
        Leiden resolution; also embedded in the output column name.
    n_neighbors : int, optional
        Defaults to the value recorded in ``adata.uns['neighbors']['params']``.
    n_pcs : int, optional
        Defaults to the number of stored principal components.

    Returns
    -------
    AnnData
        Same object with the new clustering column in ``.obs``.
    """
    # fall back to the parameters already recorded on the object
    if n_neighbors is None:
        n_neighbors = adata.uns['neighbors']['params']['n_neighbors']
    if n_pcs is None:
        n_pcs = adata.varm['PCs'].shape[1]
    # graph name encodes its parameters so repeated calls don't collide
    graph_key = f"{omic_key}_{n_neighbors}_neighbors_{n_pcs}_PCs"
    cluster_key = f"{omic_key}_clust_{res}"
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs,
                    use_rep=f"{omic_key}_umap", key_added=graph_key)
    sc.tl.leiden(adata, resolution=res, key_added=cluster_key,
                 neighbors_key=graph_key)
    return adata
def weightsInit(m):
    """Initialise Linear layers in-place: Xavier-uniform weights, zero bias.

    Intended to be passed to ``torch.nn.Module.apply`` (see
    ``omicsIntegration``); modules that are not ``torch.nn.Linear`` are left
    untouched.

    Parameters
    ----------
    m : torch.nn.Module
        Submodule visited by ``Module.apply``.
    """
    if isinstance(m, torch.nn.Linear):
        # xavier_uniform_ is the in-place spelling; the non-underscore
        # torch.nn.init.xavier_uniform used before is a deprecated alias.
        torch.nn.init.xavier_uniform_(m.weight)
        # Linear(..., bias=False) has bias=None — guard before zeroing.
        if m.bias is not None:
            m.bias.data.zero_()
def omicsIntegration(adata, transcript_key='trans', variant_key='variant', clf_out=25, integration_key='int'):
    """Train the paired contrastive network on the two scaled matrices and
    store a joint UMAP (the per-cell mean of the two modality embeddings).

    Parameters
    ----------
    adata : AnnData
        Must already contain ``<transcript_key>_X`` and ``<variant_key>_X``
        in ``.uns`` (produced by transcriptomicAnalysis / variantAnalysis);
        both matrices are assumed to have the same cells in the same row
        order — TODO confirm upstream guarantees this.
    transcript_key, variant_key : str
        Prefixes of the two input matrices in ``adata.uns``.
    clf_out : int
        Output width of the network's classifier head.
    integration_key : str
        Prefix of the stored joint embedding (``<key>_umap`` in ``.obsm``).

    Returns
    -------
    AnnData
        Same object with the integrated UMAP attached.
    """
    # perhaps clf_out should be set to the number of cell types known to be present
    # Use integration
    rna_X = adata.uns[transcript_key + '_X']
    var_X = adata.uns[variant_key + '_X']
    net = pairedIntegration(input_dim_a=rna_X.shape[1], input_dim_b=var_X.shape[1], clf_out=clf_out)
    net.apply(weightsInit)
    # Training process
    pairedIntegrationTrainer(X_a=rna_X, X_b=var_X, model=net, num_epoch=50)
    # Integrates visualization
    net.to(torch.device("cpu"))
    X_all_tensor_a = torch.tensor(rna_X).float()
    X_all_tensor_b = torch.tensor(var_X).float()
    # embed each modality with its trained encoder (128-dim output),
    # then L2-normalise rows before computing the joint UMAP
    y_pred_a = net.encoder_a(X_all_tensor_a)
    y_pred_a = F.normalize(y_pred_a, dim=1, p=2)
    y_pred_a = torch.Tensor.cpu(y_pred_a).detach().numpy()
    y_pred_b = net.encoder_b(X_all_tensor_b)
    y_pred_b = F.normalize(y_pred_b, dim=1, p=2)
    y_pred_b = torch.Tensor.cpu(y_pred_b).detach().numpy()
    # Coord Mean: fit one UMAP on the stacked embeddings of both modalities
    y_pred = np.concatenate((y_pred_a, y_pred_b), axis=0)
    umaps = umap.UMAP(n_neighbors=5, min_dist=0.0, n_components=2, metric="euclidean", random_state=42).fit(y_pred)
    embedding = umaps.transform(y_pred)
    embedding = pd.DataFrame(embedding)
    embedding.columns = ['UMAP1', 'UMAP2']
    emb = embedding.iloc[:, :2].values
    # first half of the rows are RNA embeddings, second half variant embeddings
    rna_coord = emb[slice(0, int(emb.shape[0] / 2))]
    variants_coord = emb[slice(int(emb.shape[0] / 2), int(emb.shape[0]))]
    new_umap = list()
    # per cell: average the RNA and variant 2-D coordinates
    for i in range(0, rna_coord.shape[0]):
        X_rna = rna_coord[i][0]
        X_var = variants_coord[i][0]
        coord_X = [X_rna, X_var]
        coord_X = np.mean(coord_X)
        Y_rna = rna_coord[i][1]
        Y_var = variants_coord[i][1]
        coord_Y = [Y_rna, Y_var]
        coord_Y = np.mean(coord_Y)
        coord = [coord_X, coord_Y]
        new_umap.append(coord)
    new_umap = np.array(new_umap)
    adata.obsm[integration_key + '_umap'] = new_umap
    return(adata)
class pairedIntegration(torch.nn.Module):
    """Two-branch network for paired-omics contrastive integration.

    Each modality gets its own encoder (MLP: input -> 1000 -> 512 -> 128,
    BatchNorm + ReLU after every linear layer).  Two shared heads sit on
    top of the 128-dim code: ``clf`` (Linear -> Softmax, ``clf_out`` wide)
    and ``feature`` (Linear to 32 dims).
    """

    def __init__(self, input_dim_a=2000, input_dim_b=2000, clf_out=10):
        super(pairedIntegration, self).__init__()
        self.input_dim_a = input_dim_a
        self.input_dim_b = input_dim_b
        self.clf_out = clf_out
        # one encoder per modality, identical architecture apart from input width
        self.encoder_a = self._build_encoder(self.input_dim_a)
        self.encoder_b = self._build_encoder(self.input_dim_b)
        # classification head: softmax over clf_out pseudo-classes
        self.clf = torch.nn.Sequential(
            torch.nn.Linear(128, self.clf_out),
            torch.nn.Softmax(dim=1),
        )
        # low-dimensional feature head used by the contrastive loss
        self.feature = torch.nn.Sequential(torch.nn.Linear(128, 32))

    @staticmethod
    def _build_encoder(in_dim):
        # Linear -> BatchNorm1d -> ReLU stack over widths in_dim/1000/512/128.
        widths = [in_dim, 1000, 512, 128]
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(torch.nn.Linear(n_in, n_out))
            layers.append(torch.nn.BatchNorm1d(n_out))
            layers.append(torch.nn.ReLU())
        return torch.nn.Sequential(*layers)

    def forward(self, x_a, x_b):
        """Encode both modalities; return (f_a, y_a, f_b, y_b)."""
        code_a = self.encoder_a(x_a)
        code_b = self.encoder_b(x_b)
        return (
            self.feature(code_a),
            self.clf(code_a),
            self.feature(code_b),
            self.clf(code_b),
        )
def pairedIntegrationTrainer(X_a, X_b, model, batch_size=512, num_epoch=5, f_temp=0.1, p_temp=1.0):
    """SGD training loop for ``pairedIntegration`` with a SimCLR-style loss.

    Per batch, two noisy views of modality A and one of modality B are pushed
    through the model; the contrastive loss is applied both across modalities
    (A vs B) and across the two A views, at the feature level and, transposed,
    at the classifier-output level.

    Parameters
    ----------
    X_a, X_b : array
        Row-aligned scaled matrices for the two modalities (same cells,
        same order — assumed, not checked).
    model : pairedIntegration
        Network to train in place.
    batch_size : int
        Mini-batch size; the trailing partial batch is dropped because
        ``contrastiveLoss`` is built for a fixed batch size.
    num_epoch : int
        Number of passes over the data.
    f_temp, p_temp : float
        Temperatures for the feature-level and probability-level losses.
    """
    device = torch.device("cpu")
    # feature-level loss over rows (cells); probability-level loss over
    # transposed outputs, i.e. batches of size model.clf_out
    f_con = contrastiveLoss(batch_size=batch_size, temperature=f_temp)
    p_con = contrastiveLoss(batch_size=model.clf_out, temperature=p_temp)
    opt = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
    for k in range(num_epoch):
        model.to(device)
        n = X_a.shape[0]
        # new random row permutation every epoch; same order for both matrices
        # so cell pairing is preserved
        r = np.random.permutation(n)
        X_train_a = X_a[r, :]
        X_tensor_A = torch.tensor(X_train_a).float()
        X_train_b = X_b[r, :]
        X_tensor_B = torch.tensor(X_train_b).float()
        losses = 0
        for j in range(n // batch_size):
            inputs_a = X_tensor_A[j * batch_size:(j + 1) * batch_size, :].to(device)
            # two independently-noised views of the same A batch
            inputs_a2 = inputs_a + torch.normal(0, 1, inputs_a.shape).to(device)
            inputs_a = inputs_a + torch.normal(0, 1, inputs_a.shape).to(device)
            inputs_b = X_tensor_B[j * batch_size:(j + 1) * batch_size, :].to(device)
            inputs_b = inputs_b + torch.normal(0, 1, inputs_b.shape).to(device)
            feas, o, nfeas, no = model(inputs_a, inputs_b)
            feas2, o2, _, _ = model(inputs_a2, inputs_b)
            # cross-modal + intra-modal agreement on features...
            fea_mi = f_con(feas, nfeas) + f_con(feas, feas2)
            # ...and on transposed class-probability columns
            p_mi = p_con(o.T, no.T) + p_con(o.T, o2.T)
            loss = fea_mi + p_mi
            opt.zero_grad()
            loss.backward()
            opt.step()
            # .tolist() on a 0-dim tensor yields a plain float
            losses += loss.data.tolist()
        print("Total loss: " + str(round(losses, 4)))
        gc.collect()
class contrastiveLoss(torch.nn.Module):
    """NT-Xent (SimCLR) contrastive loss over a batch of paired embeddings.

    ``forward(emb_i, emb_j)`` treats row k of ``emb_i`` and row k of
    ``emb_j`` as the positive pair; every other row in the stacked 2N
    batch is a negative.  Both the temperature and the negatives mask are
    registered as buffers so they follow the module across devices.
    """

    def __init__(self, batch_size, temperature=0.5):
        super().__init__()
        self.batch_size = batch_size
        self.register_buffer("temperature", torch.tensor(temperature))
        # mask that zeroes out each sample's similarity with itself
        not_self = ~torch.eye(batch_size * 2, batch_size * 2, dtype=bool)
        self.register_buffer("negatives_mask", not_self.float())

    def forward(self, emb_i, emb_j):
        """Return the scalar NT-Xent loss for the paired batches."""
        # L2-normalise each view, then stack into one (2N, D) matrix
        view_i = F.normalize(emb_i, dim=1, p=2)
        view_j = F.normalize(emb_j, dim=1, p=2)
        stacked = torch.cat([view_i, view_j], dim=0)
        # full (2N, 2N) pairwise cosine-similarity matrix
        sim = F.cosine_similarity(stacked.unsqueeze(1), stacked.unsqueeze(0), dim=2)
        # similarities of each sample with its counterpart in the other view
        # live on the +N and -N diagonals
        pos_ij = torch.diag(sim, self.batch_size)
        pos_ji = torch.diag(sim, -self.batch_size)
        positives = torch.cat([pos_ij, pos_ji], dim=0)
        numerator = torch.exp(positives / self.temperature)
        masked = self.negatives_mask * torch.exp(sim / self.temperature)
        denominator = torch.sum(masked, dim=1)
        per_sample = -torch.log(numerator / denominator)
        return torch.sum(per_sample) / (2 * self.batch_size)
def distributionClusters(adata, control_cl, group_cl, perc_cell_to_show=0.1, figsize=(12, 8), dpi=100, save_path=None):
    """Plot, as a horizontal stacked bar chart, how the cells of each
    ``group_cl`` cluster are distributed across the ``control_cl`` clusters.

    Parameters
    ----------
    adata : AnnData
        Object whose ``.obs`` holds both clustering columns.
    control_cl : str
        Column whose categories form the bar segments (x axis shows their
        within-group fraction).
    group_cl : str
        Column whose categories form the bars (one bar per cluster).
    perc_cell_to_show : float
        Hide any group cluster holding less than this fraction of all cells.
    figsize, dpi :
        Matplotlib figure settings (applied via rcParams, i.e. globally).
    save_path : str, optional
        If given, the figure is also written to this path.

    Returns
    -------
    The value of ``plt.show()`` (None in practice).
    """
    # cross-tabulate: rows = group clusters, columns = control clusters
    df = adata.obs.groupby([group_cl, control_cl]).size().unstack()
    df = df.loc[df.sum(axis=1) / df.values.sum() >= perc_cell_to_show, :]
    # remove row if group_cluster represents less cells in perc than perc_cell_to_show.
    # keep absolute counts for the segment labels before converting to fractions
    df_rel = df
    df = df.div(df.sum(axis=1), axis=0)
    df[group_cl] = df.index
    plt.rcParams["figure.figsize"] = figsize
    plt.rcParams['figure.dpi'] = dpi
    plt.rcParams['figure.facecolor'] = '#FFFFFF'
    df.plot(x=group_cl, kind='barh', stacked=True, mark_right=True, )
    leg = plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
    leg.get_frame().set_edgecolor('black')
    plt.xlabel('perc_' + control_cl, loc='center')
    # annotate each bar segment with its absolute cell count, centred in the
    # segment; skip zero counts and segments narrower than 5% to avoid clutter
    for n in df_rel:
        for i, (pos_x, ab, value) in enumerate(zip(df.iloc[:, :-1].cumsum(1)[n], df[n], df_rel[n])):
            if (value == 0) | (ab <= 0.05):
                value = ''
            plt.text(pos_x - ab / 2, i, str(value), va='center', ha='center')
    plt.grid(False)
    if save_path is not None:
        plt.savefig(save_path, bbox_inches='tight')
    return(plt.show())
setup.py
0 → 100644
View file @
c4cc0487
import setuptools

# The long description shown on PyPI comes straight from the README.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="scVAR",
    version="0.0.1",
    author="Samuele Manessi",
    author_email="samuele.manessi@itb.cnr.it",
    description="A tool to integrate genomics and transcriptomics in scRNA-seq data.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    #url="https://github.com/ILTUOACCOUNT/ILTUOPACCHETTO",
    packages=setuptools.find_packages(include=['scVAR', 'scVAR.*']),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    install_requires=[
        'numpy',
        'pandas',
        'scanpy',
        # 'gc' is part of the standard library and must not be listed here
        'torch',
        # the code imports `umap` from the umap-learn distribution; the PyPI
        # package literally named "umap" is an unrelated project
        'umap-learn',
        'anndata',
        # "sklearn" is a deprecated PyPI placeholder that fails to install;
        # the real distribution for the imported `sklearn` modules is
        # scikit-learn
        'scikit-learn',
        'scipy',
        'matplotlib',
    ],
    python_requires='>=3.7',
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment