Commit 2c405146 authored by Ivan Merelli
Browse files

Initial commit

parents
This diff is collapsed.
# Extract data in 10x format for scVAR
import pandas as pd
from scipy import io, sparse
import os
def convert_expression(expr_csv, out_dir):
    """Convert an expression CSV into 10x-style files inside ``out_dir``.

    The CSV is read with its first column as the row index. Three files are
    written:

    * ``matrix.mtx`` -- the values as a sparse Matrix Market file, in the
      same orientation as the CSV (CSV rows stay rows, CSV columns stay
      columns).
    * ``clean_barcodes.txt`` -- the CSV column labels, one per line.
    * ``features.tsv`` -- one tab-separated line per CSV row holding the row
      label twice (as id and as name) followed by ``Gene Expression``.

    Args:
        expr_csv: Path to the expression CSV file.
        out_dir: Directory that receives the output files. It is created
            (including parents) when it does not exist yet.
    """
    print("🔄 Converting expression data...")
    expr = pd.read_csv(expr_csv, index_col=0)

    # Create the target only after the input was read successfully, so a bad
    # input path does not leave an empty directory behind. Without this, a
    # direct call with a fresh directory fails inside mmwrite.
    os.makedirs(out_dir, exist_ok=True)

    # Save matrix in Matrix Market format
    expr_sparse = sparse.csr_matrix(expr.values)
    io.mmwrite(os.path.join(out_dir, "matrix.mtx"), expr_sparse)

    # Save barcodes (the CSV column labels)
    barcodes = pd.Series(expr.columns)
    barcodes.to_csv(
        os.path.join(out_dir, "clean_barcodes.txt"), index=False, header=False
    )

    # Save features.tsv in 10X format: gene_id\tgene_name\tGene Expression
    # The CSV carries a single label per row, so it serves as both id and name.
    features = pd.DataFrame(
        {
            "gene_id": expr.index,
            "gene_name": expr.index,
            "feature_type": "Gene Expression",
        }
    )
    features.to_csv(
        os.path.join(out_dir, "features.tsv"), sep="\t", index=False, header=False
    )
    print("✅ Expression conversion complete!")
def convert_variants(variant_csv, out_dir):
    """Convert a variant CSV into matrix/barcode/variant files in ``out_dir``.

    The CSV is read with its first column as the row index. Three files are
    written:

    * ``consensus_filtered_markdup.mtx`` -- the values as a sparse Matrix
      Market file, in the same orientation as the CSV.
    * ``barcodes_var.tsv`` -- the CSV column labels, one per line.
    * ``variants_filtered_markdup.txt`` -- the CSV row labels, one per line.

    Args:
        variant_csv: Path to the variant CSV file.
        out_dir: Directory that receives the output files. It is created
            (including parents) when it does not exist yet.
    """
    print("🔄 Converting variant data...")
    var = pd.read_csv(variant_csv, index_col=0)

    # Create the target only after the input was read successfully, so a bad
    # input path does not leave an empty directory behind. Without this, a
    # direct call with a fresh directory fails inside mmwrite.
    os.makedirs(out_dir, exist_ok=True)

    # Save matrix in Matrix Market format
    var_sparse = sparse.csr_matrix(var.values)
    io.mmwrite(os.path.join(out_dir, "consensus_filtered_markdup.mtx"), var_sparse)

    # Save barcodes (the CSV column labels)
    barcodes = pd.Series(var.columns)
    barcodes.to_csv(
        os.path.join(out_dir, "barcodes_var.tsv"), index=False, header=False
    )

    # Save variant list (the CSV row labels)
    variants = pd.Series(var.index)
    variants.to_csv(
        os.path.join(out_dir, "variants_filtered_markdup.txt"),
        index=False,
        header=False,
    )
    print("✅ Variant conversion complete!")
def main():
    """Parse the command line and run both conversions into one directory.

    Three options are required: ``--expression_csv``, ``--variant_csv`` and
    ``--output_dir``. The output directory is created when missing, then the
    expression conversion runs, followed by the variant conversion.
    """
    import argparse

    cli = argparse.ArgumentParser(
        description="Convert synthetic scRNA-seq and variant data into scVAR-compatible format."
    )
    # (flag, help text) pairs; every option is mandatory.
    required_options = (
        ("--expression_csv", "Path to expression.csv"),
        ("--variant_csv", "Path to variants.csv"),
        ("--output_dir", "Directory to save scVAR-compatible files"),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)
    opts = cli.parse_args()

    destination = opts.output_dir
    os.makedirs(destination, exist_ok=True)

    convert_expression(opts.expression_csv, destination)
    convert_variants(opts.variant_csv, destination)


if __name__ == "__main__":
    main()
import argparse
import numpy as np
import pandas as pd
import random
def generate_data(N, F, V, T, G,
                  expr_sparsity, var_sparsity,
                  expr_error, var_error, label_noise,
                  expr_out, var_out, truth_out, cell_metadata_out):
    """Generate a synthetic expression matrix, variant matrix and labels.

    Both NumPy's and the stdlib's global random generators are seeded with
    42, so repeated calls with the same arguments produce the same files.

    Args:
        N: Number of cells (columns of both matrices).
        F: Number of genes (rows of the expression matrix).
        V: Number of variants (rows of the variant matrix).
        T: Number of cell types.
        G: Number of genotypes.
        expr_sparsity: Fraction of genes set to 0 in each cell.
        var_sparsity: Fraction of variants set to 0 in each cell.
        expr_error: Scale of the Gaussian noise added to each cell's
            expression profile (multiplied by a per-gene factor).
        var_error: Fraction of variants per cell replaced by a random 0/1.
        label_noise: Fraction of cells that receive a wrong label.
        expr_out: Output CSV path for the genes x cells expression matrix.
        var_out: Output CSV path for the variants x cells matrix.
        truth_out: Output CSV path for the CellType/Genotype pairs.
        cell_metadata_out: Output CSV path for per-cell noisy and true labels.
    """
    np.random.seed(42)
    random.seed(42)

    # Define labels
    cell_types = [f"CellType_{i}" for i in range(T)]
    genotypes = [f"Genotype_{i}" for i in range(G)]
    cells = [f"Cell_{i}" for i in range(N)]

    # Assign true labels: every cell type is compatible with a random,
    # non-empty subset of genotypes, and each cell draws from that subset.
    true_cell_types = np.random.choice(cell_types, size=N)
    celltype_to_genotypes = {
        ct: sorted(random.sample(genotypes, random.randint(1, G)))
        for ct in cell_types
    }
    true_genotypes = [
        random.choice(celltype_to_genotypes[ct]) for ct in true_cell_types
    ]

    # Corrupt labels if needed
    noisy_cell_types = true_cell_types.copy()
    noisy_genotypes = true_genotypes.copy()
    n_noisy = int(label_noise * N)
    noisy_indices = np.random.choice(N, n_noisy, replace=False)
    for i in noisy_indices:
        actual_ct = true_cell_types[i]
        # Candidates are built by filtering the ordered label lists rather
        # than via set difference: the iteration order of a set of strings
        # depends on PYTHONHASHSEED, which would defeat the fixed seeds.
        wrong_ct = [ct for ct in cell_types if ct != actual_ct]
        if wrong_ct:  # with a single cell type there is nothing to swap to
            noisy_cell_types[i] = random.choice(wrong_ct)
        allowed_genos = celltype_to_genotypes[actual_ct]
        wrong_genos = [g for g in genotypes if g not in allowed_genos]
        if wrong_genos:
            noisy_genotypes[i] = random.choice(wrong_genos)

    # ---------------------------------------------
    # Generate reference transcriptome per CellType
    # ---------------------------------------------
    expr_reference = {
        ct: np.random.lognormal(mean=1.0, sigma=0.5, size=F)
        for ct in cell_types
    }
    # Per-gene noise factor, shared by all cell types and cells
    gene_noise_factors = np.random.uniform(0.5, 1.5, size=F)
    noise_std = expr_error * gene_noise_factors  # identical for every cell

    # Generate expression matrix with noise per cell. The profile follows the
    # (possibly corrupted) noisy label, not the true one.
    expr_data = np.zeros((F, N))  # genes x cells
    for i in range(N):
        profile = expr_reference[noisy_cell_types[i]]
        expr_data[:, i] = np.random.normal(loc=profile, scale=noise_std)

    # Apply sparsity (dropouts): zero out a fraction of genes in each cell.
    # Kept as a separate pass so the order of random draws stays stable.
    n_expr_zeros = int(expr_sparsity * F)
    for i in range(N):
        zero_indices = np.random.choice(F, size=n_expr_zeros, replace=False)
        expr_data[zero_indices, i] = 0.0
    expr_data[np.abs(expr_data) < 1e-10] = 0.0  # clean -0

    expr_df = pd.DataFrame(
        expr_data, index=[f"Gene_{i}" for i in range(F)], columns=cells
    )
    expr_df.to_csv(expr_out)

    # ------------------------------------------
    # Generate reference genotype profiles
    # ------------------------------------------
    variant_reference = {
        g: np.random.binomial(1, 0.1, size=V)
        for g in genotypes
    }

    # Generate variant matrix for each cell from its (noisy) genotype
    n_changes = int(var_error * V)
    n_var_zeros = int(var_sparsity * V)
    var_data = np.zeros((V, N), dtype=int)
    for i in range(N):
        profile = variant_reference[noisy_genotypes[i]].copy()
        # Introduce random changes: replace var_error * V entries with 0 or 1
        indices_to_modify = np.random.choice(V, size=n_changes, replace=False)
        profile[indices_to_modify] = np.random.randint(0, 2, size=n_changes)
        # Apply sparsity (dropouts): zero out a fraction of variants
        zero_indices = np.random.choice(V, size=n_var_zeros, replace=False)
        profile[zero_indices] = 0
        var_data[:, i] = profile

    var_df = pd.DataFrame(
        var_data, index=[f"Variant_{i}" for i in range(V)], columns=cells
    )
    var_df.to_csv(var_out)

    # ------------------------------------------
    # Save CellType-to-Genotype mapping (truth)
    # ------------------------------------------
    ct_gt = pd.DataFrame(
        [(ct, g) for ct, gs in celltype_to_genotypes.items() for g in gs],
        columns=["CellType", "Genotype"],
    )
    ct_gt.to_csv(truth_out, index=False)

    # ------------------------------------------
    # Save per-cell metadata
    # ------------------------------------------
    metadata_df = pd.DataFrame({
        "Cell": cells,
        "CellType": noisy_cell_types,
        "Genotype": noisy_genotypes,
        "TrueCellType": true_cell_types,
        "TrueGenotype": true_genotypes,
    })
    metadata_df.to_csv(cell_metadata_out, index=False)

    print(f"✔ Expression matrix saved to: {expr_out}")
    print(f"✔ Variant matrix saved to: {var_out}")
    print(f"✔ CellType–Genotype mapping saved to: {truth_out}")
    print(f"✔ Cell metadata (true + noisy labels) saved to: {cell_metadata_out}")
if __name__ == "__main__":
    # Command-line entry point: collect the dataset dimensions, output paths
    # and noise settings, then hand everything to generate_data().
    cli = argparse.ArgumentParser(
        description="Generate synthetic single-cell data with realistic transcriptome and variant noise."
    )

    # Mandatory integer dimensions.
    for size_flag in ("--cells", "--genes", "--variants", "--celltypes", "--genotypes"):
        cli.add_argument(size_flag, type=int, required=True)

    # Mandatory output file paths.
    for path_flag in ("--expr_out", "--var_out", "--truth_out", "--cell_metadata_out"):
        cli.add_argument(path_flag, type=str, required=True)

    # Optional float settings as (flag, default, help text), grouped as:
    # sparsity = dropout-like effect, noise = deviation from the reference
    # profile, label noise = wrong labels.
    float_options = (
        ("--expr_sparsity", 0.3,
         "Fraction of genes set to 0 in each cell (simulates dropout)"),
        ("--var_sparsity", 0.1,
         "Fraction of variants set to 0 in each cell (simulates missed calls)"),
        ("--expr_error", 0.3,
         "Amount of expression noise per gene per cell"),
        ("--var_error", 0.3,
         "Fraction of variants changed arbitrarily per cell"),
        ("--label_noise", 0.0,
         "Fraction of cells with incorrect celltype/genotype labels"),
    )
    for flag, default_value, help_text in float_options:
        cli.add_argument(flag, type=float, default=default_value, help=help_text)

    opts = cli.parse_args()

    generate_data(
        N=opts.cells,
        F=opts.genes,
        V=opts.variants,
        T=opts.celltypes,
        G=opts.genotypes,
        expr_sparsity=opts.expr_sparsity,
        var_sparsity=opts.var_sparsity,
        expr_error=opts.expr_error,
        var_error=opts.var_error,
        label_noise=opts.label_noise,
        expr_out=opts.expr_out,
        var_out=opts.var_out,
        truth_out=opts.truth_out,
        cell_metadata_out=opts.cell_metadata_out,
    )
import setuptools

# The README doubles as the long description of the distribution.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="scVAR",
    version="0.0.2",
    author="Ivan Merelli",
    author_email="ivan.merelli@itb.cnr.it",
    description="A tool to integrate genomics and transcriptomics in scRNA-seq data.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    # Ship only the scVAR package and its subpackages.
    packages=setuptools.find_packages(include=['scVAR', 'scVAR.*']),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    install_requires=[
        'numpy',
        'pandas',
        'scanpy',
        'torch',
        # The UMAP implementation imported as `umap` is distributed on PyPI
        # as `umap-learn`; the PyPI project named `umap` is unrelated.
        # NOTE(review): confirm scVAR imports `umap` from umap-learn.
        'umap-learn',
        'leidenalg',
        'igraph',
        'anndata',
        'scikit-learn',
        'scipy',
        'matplotlib',
    ],
    python_requires='>=3.10',
)
../tests
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment