Commit 6a3d172f authored by Ivan Merelli
Browse files

Added synthetic data generator

parent 13ac2ab8
This diff is collapsed.
This diff is collapsed.
import argparse
import numpy as np
import pandas as pd
import random
def generate_data(N, F, V, T, G,
                  expr_sparsity, var_sparsity,
                  expr_error, var_error, label_noise,
                  expr_out, var_out, truth_out, cell_metadata_out):
    """Generate a synthetic single-cell dataset and write it to four CSV files.

    Every cell is assigned a cell type and a genotype compatible with that
    cell type. A fraction of the cells then has its labels corrupted, and a
    noisy expression profile (genes x cells) and a noisy binary variant
    profile (variants x cells) are simulated from the resulting labels.

    Both NumPy's and the standard library's global random generators are
    re-seeded with 42 on every call, so the output is reproducible.

    Args:
        N: Number of cells.
        F: Number of genes (expression features).
        V: Number of variants.
        T: Number of cell types.
        G: Number of genotypes.
        expr_sparsity: Fraction of genes zeroed in each cell, in [0, 1].
        var_sparsity: Fraction of variants zeroed in each cell, in [0, 1].
        expr_error: Base standard deviation of the Gaussian expression noise;
            it is scaled per gene by a factor drawn from U(0.5, 1.5).
        var_error: Fraction of variants per cell overwritten with a random
            0/1 value, in [0, 1].
        label_noise: Fraction of cells whose labels are corrupted, in [0, 1].
        expr_out: Destination of the expression matrix CSV.
        var_out: Destination of the variant matrix CSV.
        truth_out: Destination of the cell type / genotype mapping CSV.
        cell_metadata_out: Destination of the per-cell metadata CSV.

    Raises:
        ValueError: If a fraction argument is outside [0, 1] or ``expr_error``
            is negative.
    """
    fractions = (
        ("expr_sparsity", expr_sparsity),
        ("var_sparsity", var_sparsity),
        ("var_error", var_error),
        ("label_noise", label_noise),
    )
    for arg_name, value in fractions:
        if not 0.0 <= value <= 1.0:
            raise ValueError(f"{arg_name} must be within [0, 1], got {value}")
    if expr_error < 0:
        raise ValueError(f"expr_error must be non-negative, got {expr_error}")

    np.random.seed(42)
    random.seed(42)

    # Define labels
    cell_types = [f"CellType_{i}" for i in range(T)]
    genotypes = [f"Genotype_{i}" for i in range(G)]
    cells = [f"Cell_{i}" for i in range(N)]

    # Assign true labels: each cell type is compatible with a random,
    # non-empty subset of the genotypes.
    true_cell_types = np.random.choice(cell_types, size=N)
    celltype_to_genotypes = {
        ct: sorted(random.sample(genotypes, random.randint(1, G)))
        for ct in cell_types
    }
    true_genotypes = [
        random.choice(celltype_to_genotypes[ct]) for ct in true_cell_types
    ]

    # Corrupt labels if needed
    noisy_cell_types = true_cell_types.copy()
    noisy_genotypes = true_genotypes.copy()
    n_noisy = int(label_noise * N)
    noisy_indices = np.random.choice(N, n_noisy, replace=False)
    for i in noisy_indices:
        true_ct = true_cell_types[i]
        # Candidates are built in declaration order rather than through a
        # set: set iteration order over strings varies between interpreter
        # runs, which would make the seeded choice below irreproducible.
        wrong_ct = [ct for ct in cell_types if ct != true_ct]
        if wrong_ct:  # with a single cell type there is nothing to swap to
            noisy_cell_types[i] = random.choice(wrong_ct)
        compatible = set(celltype_to_genotypes[true_ct])
        wrong_genos = [g for g in genotypes if g not in compatible]
        if wrong_genos:
            noisy_genotypes[i] = random.choice(wrong_genos)

    # ---------------------------------------------
    # Generate reference transcriptome per CellType
    # ---------------------------------------------
    expr_reference = {
        ct: np.random.lognormal(mean=1.0, sigma=0.5, size=F)
        for ct in cell_types
    }
    # Per-gene noise scale, shared by all cell types and cells
    gene_noise_factors = np.random.uniform(0.5, 1.5, size=F)
    noise_std = expr_error * gene_noise_factors

    # Generate expression matrix with noise per cell.
    # NOTE: profiles follow the noisy labels, so the "True*" metadata columns
    # describe the assignment before corruption. Gaussian noise may produce
    # negative values; they are written as drawn.
    expr_data = np.zeros((F, N))  # genes x cells
    for i in range(N):
        profile = expr_reference[noisy_cell_types[i]]
        expr_data[:, i] = np.random.normal(loc=profile, scale=noise_std)

    # Apply sparsity (dropouts): zero out a fraction of genes in each cell
    n_expr_zeros = int(expr_sparsity * F)
    for i in range(N):
        zero_indices = np.random.choice(F, size=n_expr_zeros, replace=False)
        expr_data[zero_indices, i] = 0.0
    expr_data[np.abs(expr_data) < 1e-10] = 0.0  # clean -0

    expr_df = pd.DataFrame(
        expr_data, index=[f"Gene_{i}" for i in range(F)], columns=cells
    )
    expr_df.to_csv(expr_out)

    # ------------------------------------------
    # Generate reference genotype profiles
    # ------------------------------------------
    variant_reference = {
        g: np.random.binomial(1, 0.1, size=V)
        for g in genotypes
    }

    # Generate variant matrix for each cell from its genotype
    n_changes = int(var_error * V)
    n_zeros = int(var_sparsity * V)
    var_data = np.zeros((V, N), dtype=int)
    for i in range(N):
        profile = variant_reference[noisy_genotypes[i]].copy()
        # Introduce random changes: overwrite n_changes entries with 0 or 1
        indices_to_modify = np.random.choice(V, size=n_changes, replace=False)
        profile[indices_to_modify] = np.random.randint(0, 2, size=n_changes)
        # Apply sparsity (dropouts): zero out n_zeros variants
        zero_indices = np.random.choice(V, size=n_zeros, replace=False)
        profile[zero_indices] = 0
        var_data[:, i] = profile

    var_df = pd.DataFrame(
        var_data, index=[f"Variant_{i}" for i in range(V)], columns=cells
    )
    var_df.to_csv(var_out)

    # ------------------------------------------
    # Save CellType <-> Genotype mapping (truth)
    # ------------------------------------------
    ct_gt = pd.DataFrame(
        [(ct, g) for ct, gs in celltype_to_genotypes.items() for g in gs],
        columns=["CellType", "Genotype"]
    )
    ct_gt.to_csv(truth_out, index=False)

    # ------------------------------------------
    # Save per-cell metadata
    # ------------------------------------------
    metadata_df = pd.DataFrame({
        "Cell": cells,
        "CellType": noisy_cell_types,
        "Genotype": noisy_genotypes,
        "TrueCellType": true_cell_types,
        "TrueGenotype": true_genotypes
    })
    metadata_df.to_csv(cell_metadata_out, index=False)

    print(f"✔ Expression matrix saved to: {expr_out}")
    print(f"✔ Variant matrix saved to: {var_out}")
    print(f"✔ CellType–Genotype mapping saved to: {truth_out}")
    print(f"✔ Cell metadata (true + noisy labels) saved to: {cell_metadata_out}")
if __name__ == "__main__":
    # Command-line entry point: collect the dataset dimensions, output paths
    # and noise settings, then forward them to generate_data().
    cli = argparse.ArgumentParser(
        description="Generate synthetic single-cell data with realistic transcriptome and variant noise."
    )

    # Required dataset dimensions
    for size_flag in ("--cells", "--genes", "--variants", "--celltypes", "--genotypes"):
        cli.add_argument(size_flag, type=int, required=True)

    # Required output paths
    for path_flag in ("--expr_out", "--var_out", "--truth_out", "--cell_metadata_out"):
        cli.add_argument(path_flag, type=str, required=True)

    # Optional knobs as (flag, default, help), registered in this order:
    # sparsity = dropout-like effect, error = deviation from the reference
    # profile, label noise = wrong labels.
    float_options = (
        ("--expr_sparsity", 0.3,
         "Fraction of genes set to 0 in each cell (simulates dropout)"),
        ("--var_sparsity", 0.1,
         "Fraction of variants set to 0 in each cell (simulates missed calls)"),
        ("--expr_error", 0.3,
         "Amount of expression noise per gene per cell"),
        ("--var_error", 0.3,
         "Fraction of variants changed arbitrarily per cell"),
        ("--label_noise", 0.0,
         "Fraction of cells with incorrect celltype/genotype labels"),
    )
    for option_flag, default_value, help_text in float_options:
        cli.add_argument(option_flag, type=float, default=default_value, help=help_text)

    # The size options map onto generate_data's single-letter parameters;
    # every other option already carries the name of the parameter it feeds.
    settings = dict(vars(cli.parse_args()))
    generate_data(
        N=settings.pop("cells"),
        F=settings.pop("genes"),
        V=settings.pop("variants"),
        T=settings.pop("celltypes"),
        G=settings.pop("genotypes"),
        **settings,
    )
CellType,Genotype
CellType_0,Genotype_0
CellType_1,Genotype_0
CellType_1,Genotype_1
CellType_1,Genotype_2
CellType_2,Genotype_0
CellType_3,Genotype_0
CellType_3,Genotype_1
CellType_3,Genotype_2
CellType_3,Genotype_3
CellType_4,Genotype_0
CellType_4,Genotype_2
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment