Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
custom
scVAR
Commits
2c405146
Commit
2c405146
authored
Nov 25, 2025
by
Ivan Merelli
Browse files
Initial commit
parents
Changes
26
Expand all
Hide whitespace changes
Inline
Side-by-side
scVAR/__pycache__/scVAR.cpython-310.pyc
0 → 100644
View file @
2c405146
File added
scVAR/scVAR.py
0 → 100644
View file @
2c405146
This diff is collapsed.
Click to expand it.
sc_gen/convert_to_scvar_format.py
0 → 100644
View file @
2c405146
# Extract data in 10x format for scVAR.
import
pandas
as
pd
from
scipy
import
io
,
sparse
import
os
def convert_expression(expr_csv, out_dir):
    """Convert an expression CSV into 10x-style files inside ``out_dir``.

    The CSV is read with its first column as the row index. Row labels are
    written as features and column labels as barcodes.

    Files written:
        matrix.mtx          -- the values as a sparse Matrix Market file
        clean_barcodes.txt  -- one column label per line
        features.tsv        -- gene_id<TAB>gene_name<TAB>Gene Expression

    Args:
        expr_csv: Path of the expression CSV to read.
        out_dir: Directory receiving the output files; created if missing.
    """
    print("🔄 Converting expression data...")
    # Make the function usable on its own, not only after main() has
    # created the output directory.
    os.makedirs(out_dir, exist_ok=True)

    expr = pd.read_csv(expr_csv, index_col=0)

    # Save matrix in Matrix Market format
    expr_sparse = sparse.csr_matrix(expr.values)
    io.mmwrite(os.path.join(out_dir, "matrix.mtx"), expr_sparse)

    # Save barcodes
    barcodes = pd.Series(expr.columns)
    barcodes.to_csv(
        os.path.join(out_dir, "clean_barcodes.txt"), index=False, header=False
    )

    # Save features.tsv in 10X format: gene_id\tgene_name\tGene Expression
    # The row label is used for both the id and the name column.
    features = pd.DataFrame({
        "gene_id": expr.index,
        "gene_name": expr.index,
        "feature_type": "Gene Expression",
    })
    features.to_csv(
        os.path.join(out_dir, "features.tsv"), sep="\t", index=False, header=False
    )
    print("✅ Expression conversion complete!")
def convert_variants(variant_csv, out_dir):
    """Convert a variant CSV into the matrix/barcode/variant files in ``out_dir``.

    The CSV is read with its first column as the row index. Row labels are
    written as the variant list and column labels as barcodes.

    Files written:
        consensus_filtered_markdup.mtx  -- the values as a sparse Matrix Market file
        barcodes_var.tsv                -- one column label per line
        variants_filtered_markdup.txt   -- one row label per line

    Args:
        variant_csv: Path of the variant CSV to read.
        out_dir: Directory receiving the output files; created if missing.
    """
    print("🔄 Converting variant data...")
    # Make the function usable on its own, not only after main() has
    # created the output directory.
    os.makedirs(out_dir, exist_ok=True)

    var = pd.read_csv(variant_csv, index_col=0)

    # Save matrix in Matrix Market format
    var_sparse = sparse.csr_matrix(var.values)
    io.mmwrite(os.path.join(out_dir, "consensus_filtered_markdup.mtx"), var_sparse)

    # Save barcodes
    barcodes = pd.Series(var.columns)
    barcodes.to_csv(
        os.path.join(out_dir, "barcodes_var.tsv"), index=False, header=False
    )

    # Save variant list
    variants = pd.Series(var.index)
    variants.to_csv(
        os.path.join(out_dir, "variants_filtered_markdup.txt"),
        index=False,
        header=False,
    )
    print("✅ Variant conversion complete!")
def main():
    """Command-line entry point: parse the three paths and run both conversions."""
    from argparse import ArgumentParser

    cli = ArgumentParser(
        description="Convert synthetic scRNA-seq and variant data into scVAR-compatible format."
    )
    # All three options are mandatory and differ only in flag and help text.
    required_options = (
        ("--expression_csv", "Path to expression.csv"),
        ("--variant_csv", "Path to variants.csv"),
        ("--output_dir", "Directory to save scVAR-compatible files"),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)
    opts = cli.parse_args()

    # Both converters write into the same directory, so create it up front.
    destination = opts.output_dir
    os.makedirs(destination, exist_ok=True)

    convert_expression(opts.expression_csv, destination)
    convert_variants(opts.variant_csv, destination)


# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()
sc_gen/generate_synthetic_singlecell.py
0 → 100644
View file @
2c405146
import
argparse
import
numpy
as
np
import
pandas
as
pd
import
random
def generate_data(N, F, V, T, G, expr_sparsity, var_sparsity, expr_error,
                  var_error, label_noise, expr_out, var_out, truth_out,
                  cell_metadata_out, seed=42):
    """Generate a synthetic single-cell dataset and write it to four CSV files.

    Each cell gets a true cell type and a true genotype drawn from the
    genotypes associated with that cell type. A fraction of the cells then
    receives wrong ("noisy") labels, and the expression and variant profiles
    are simulated from the noisy labels.

    Args:
        N: Number of cells.
        F: Number of genes.
        V: Number of variants.
        T: Number of cell types.
        G: Number of genotypes.
        expr_sparsity: Fraction of genes set to 0 in each cell.
        var_sparsity: Fraction of variants set to 0 in each cell.
        expr_error: Scale of the per-gene normal noise added to the
            cell-type reference profile.
        var_error: Fraction of variants per cell replaced by a random 0/1.
        label_noise: Fraction of cells that receive wrong labels.
        expr_out: Output CSV path for the genes x cells expression matrix.
        var_out: Output CSV path for the variants x cells matrix.
        truth_out: Output CSV path for the CellType/Genotype mapping.
        cell_metadata_out: Output CSV path for the per-cell noisy and
            true labels.
        seed: Seed passed to both ``np.random.seed`` and ``random.seed``
            (default 42, the previously hard-coded value).
    """
    np.random.seed(seed)
    random.seed(seed)

    # Define labels
    cell_types = [f"CellType_{i}" for i in range(T)]
    genotypes = [f"Genotype_{i}" for i in range(G)]
    cells = [f"Cell_{i}" for i in range(N)]

    # Assign true labels
    true_cell_types = np.random.choice(cell_types, size=N)
    celltype_to_genotypes = {
        ct: sorted(random.sample(genotypes, random.randint(1, G)))
        for ct in cell_types
    }
    true_genotypes = [
        random.choice(celltype_to_genotypes[ct]) for ct in true_cell_types
    ]

    # Corrupt labels if needed
    noisy_cell_types = true_cell_types.copy()
    noisy_genotypes = true_genotypes.copy()
    n_noisy = int(label_noise * N)
    noisy_indices = np.random.choice(N, n_noisy, replace=False)
    for i in noisy_indices:
        true_ct = str(true_cell_types[i])
        # Candidates are built as ordered lists rather than from a set:
        # set iteration order depends on per-process string hashing, which
        # would make the seeded choice below differ between runs.
        wrong_ct = [ct for ct in cell_types if ct != true_ct]
        if wrong_ct:  # with a single cell type there is nothing to swap to
            noisy_cell_types[i] = random.choice(wrong_ct)
        allowed_genos = set(celltype_to_genotypes[true_ct])
        wrong_genos = [g for g in genotypes if g not in allowed_genos]
        if wrong_genos:
            noisy_genotypes[i] = random.choice(wrong_genos)

    # ---------------------------------------------
    # Generate reference transcriptome per CellType
    # ---------------------------------------------
    expr_reference = {
        ct: np.random.lognormal(mean=1.0, sigma=0.5, size=F)
        for ct in cell_types
    }
    # Per-gene noise factors, shared by all cell types
    gene_noise_factors = np.random.uniform(0.5, 1.5, size=F)
    noise_std = expr_error * gene_noise_factors  # identical for every cell

    # Generate expression matrix with noise per cell
    expr_data = np.zeros((F, N))  # genes x cells
    for i in range(N):
        profile = expr_reference[noisy_cell_types[i]]
        expr_data[:, i] = np.random.normal(loc=profile, scale=noise_std)

    # Apply sparsity (dropouts): zero out expr_sparsity% of genes in each cell
    n_expr_zeros = int(expr_sparsity * F)
    for i in range(N):
        zero_indices = np.random.choice(F, size=n_expr_zeros, replace=False)
        expr_data[zero_indices, i] = 0.0
    expr_data[np.abs(expr_data) < 1e-10] = 0.0  # clean -0

    expr_df = pd.DataFrame(
        expr_data, index=[f"Gene_{i}" for i in range(F)], columns=cells
    )
    expr_df.to_csv(expr_out)

    # ------------------------------------------
    # Generate reference genotype profiles
    # ------------------------------------------
    variant_reference = {
        g: np.random.binomial(1, 0.1, size=V) for g in genotypes
    }

    # Generate variant matrix for each cell from its genotype
    n_changes = int(var_error * V)
    n_zeros = int(var_sparsity * V)
    var_data = np.zeros((V, N), dtype=int)
    for i in range(N):
        profile = variant_reference[noisy_genotypes[i]].copy()

        # Introduce random changes: replace var_error * V entries with 0 or 1
        indices_to_modify = np.random.choice(V, size=n_changes, replace=False)
        profile[indices_to_modify] = np.random.randint(0, 2, size=n_changes)

        # Apply sparsity (dropouts): zero out var_sparsity% of variants
        zero_indices = np.random.choice(V, size=n_zeros, replace=False)
        profile[zero_indices] = 0

        var_data[:, i] = profile

    var_df = pd.DataFrame(
        var_data, index=[f"Variant_{i}" for i in range(V)], columns=cells
    )
    var_df.to_csv(var_out)

    # ------------------------------------------
    # Save CellType <-> Genotype mapping (truth)
    # ------------------------------------------
    ct_gt = pd.DataFrame(
        [(ct, g) for ct, gs in celltype_to_genotypes.items() for g in gs],
        columns=["CellType", "Genotype"],
    )
    ct_gt.to_csv(truth_out, index=False)

    # ------------------------------------------
    # Save per-cell metadata
    # ------------------------------------------
    metadata_df = pd.DataFrame({
        "Cell": cells,
        "CellType": noisy_cell_types,
        "Genotype": noisy_genotypes,
        "TrueCellType": true_cell_types,
        "TrueGenotype": true_genotypes,
    })
    metadata_df.to_csv(cell_metadata_out, index=False)

    print(f"✔ Expression matrix saved to: {expr_out}")
    print(f"✔ Variant matrix saved to: {var_out}")
    print(f"✔ CellType–Genotype mapping saved to: {truth_out}")
    print(f"✔ Cell metadata (true + noisy labels) saved to: {cell_metadata_out}")
# Command-line interface: collect the dataset dimensions, output paths and
# noise settings, then hand them to generate_data.
if __name__ == "__main__":
    cli = argparse.ArgumentParser(
        description="Generate synthetic single-cell data with realistic transcriptome and variant noise."
    )

    # Mandatory dataset dimensions
    for size_flag in ("--cells", "--genes", "--variants", "--celltypes", "--genotypes"):
        cli.add_argument(size_flag, type=int, required=True)

    # Mandatory output locations
    for path_flag in ("--expr_out", "--var_out", "--truth_out", "--cell_metadata_out"):
        cli.add_argument(path_flag, type=str, required=True)

    # Optional fractions, as (flag, default, help text):
    #   sparsity    = dropout-like effect
    #   error       = deviation from the reference profile
    #   label noise = wrong labels
    fraction_options = (
        ("--expr_sparsity", 0.3,
         "Fraction of genes set to 0 in each cell (simulates dropout)"),
        ("--var_sparsity", 0.1,
         "Fraction of variants set to 0 in each cell (simulates missed calls)"),
        ("--expr_error", 0.3,
         "Amount of expression noise per gene per cell"),
        ("--var_error", 0.3,
         "Fraction of variants changed arbitrarily per cell"),
        ("--label_noise", 0.0,
         "Fraction of cells with incorrect celltype/genotype labels"),
    )
    for flag, default_value, help_text in fraction_options:
        cli.add_argument(flag, type=float, default=default_value, help=help_text)

    opts = cli.parse_args()

    generate_data(
        N=opts.cells,
        F=opts.genes,
        V=opts.variants,
        T=opts.celltypes,
        G=opts.genotypes,
        expr_sparsity=opts.expr_sparsity,
        var_sparsity=opts.var_sparsity,
        expr_error=opts.expr_error,
        var_error=opts.var_error,
        label_noise=opts.label_noise,
        expr_out=opts.expr_out,
        var_out=opts.var_out,
        truth_out=opts.truth_out,
        cell_metadata_out=opts.cell_metadata_out,
    )
setup.py
0 → 100644
View file @
2c405146
import setuptools

# Reuse the README as the long description of the package.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="scVAR",
    version="0.0.2",
    author="Ivan Merelli",
    author_email="ivan.merelli@itb.cnr.it",
    description="A tool to integrate genomics and transcriptomics in scRNA-seq data.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    packages=setuptools.find_packages(include=['scVAR', 'scVAR.*']),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    install_requires=[
        'numpy',
        'pandas',
        'scanpy',
        'torch',
        # The `umap` module is distributed on PyPI as `umap-learn`;
        # the distribution named `umap` is an unrelated package.
        'umap-learn',
        'leidenalg',
        'igraph',
        'anndata',
        'scikit-learn',
        'scipy',
        'matplotlib',
    ],
    python_requires='>=3.10',
)
tests
0 → 120000
View file @
2c405146
../tests
\ No newline at end of file
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment