"""Generate synthetic single-cell expression and variant matrices.

The script simulates N cells, each with a cell type and a genotype, and
writes four CSV files: a genes x cells expression matrix, a variants x cells
binary matrix, the CellType <-> Genotype ground-truth mapping, and per-cell
metadata holding both the true and the (optionally corrupted) labels.
"""

import argparse
import random

import numpy as np
import pandas as pd


def _check_fraction(name, value):
    """Raise ValueError unless *value* lies in the closed interval [0, 1]."""
    if not 0.0 <= value <= 1.0:
        raise ValueError(f"{name} must be between 0 and 1, got {value!r}")


def generate_data(N, F, V, T, G,
                  expr_sparsity, var_sparsity,
                  expr_error, var_error,
                  label_noise,
                  expr_out, var_out, truth_out, cell_metadata_out,
                  seed=42):
    """Simulate cells and write the four output CSV files.

    Args:
        N: Number of cells.
        F: Number of genes (rows of the expression matrix).
        V: Number of variants (rows of the variant matrix).
        T: Number of cell types.
        G: Number of genotypes.
        expr_sparsity: Fraction of genes zeroed in every cell.
        var_sparsity: Fraction of variants zeroed in every cell.
        expr_error: Scale of the Gaussian expression noise; it is multiplied
            by a per-gene factor drawn uniformly from [0.5, 1.5).
        var_error: Fraction of variants per cell that are re-drawn as a
            random 0/1 value.
        label_noise: Fraction of cells whose labels are replaced by wrong
            ones before the matrices are generated.
        expr_out: Output path of the expression matrix CSV.
        var_out: Output path of the variant matrix CSV.
        truth_out: Output path of the CellType/Genotype mapping CSV.
        cell_metadata_out: Output path of the per-cell metadata CSV.
        seed: Seed applied to both ``numpy.random`` and ``random``
            (defaults to 42, the value that used to be hard-coded).

    Raises:
        ValueError: If a fraction argument is outside [0, 1] or
            ``expr_error`` is negative.
    """
    for name, value in (
        ("expr_sparsity", expr_sparsity),
        ("var_sparsity", var_sparsity),
        ("var_error", var_error),
        ("label_noise", label_noise),
    ):
        _check_fraction(name, value)
    if expr_error < 0:
        raise ValueError(f"expr_error must be non-negative, got {expr_error!r}")

    np.random.seed(seed)
    random.seed(seed)

    # Define labels
    cell_types = [f"CellType_{i}" for i in range(T)]
    genotypes = [f"Genotype_{i}" for i in range(G)]
    cells = [f"Cell_{i}" for i in range(N)]

    # Assign true labels: every cell type is compatible with a random,
    # non-empty subset of genotypes, and each cell draws one of them.
    true_cell_types = np.random.choice(cell_types, size=N)
    celltype_to_genotypes = {
        ct: sorted(random.sample(genotypes, random.randint(1, G)))
        for ct in cell_types
    }
    true_genotypes = [
        random.choice(celltype_to_genotypes[ct]) for ct in true_cell_types
    ]

    # Corrupt labels if needed
    noisy_cell_types = true_cell_types.copy()
    noisy_genotypes = list(true_genotypes)
    n_noisy = int(label_noise * N)
    noisy_indices = np.random.choice(N, n_noisy, replace=False)
    for i in noisy_indices:
        true_ct = true_cell_types[i]
        # Candidates are built in list order rather than from a set: set
        # iteration order of strings varies between interpreter runs (hash
        # randomization), which would defeat the fixed seed.
        wrong_cts = [ct for ct in cell_types if ct != true_ct]
        if wrong_cts:  # with a single cell type there is nothing to swap to
            noisy_cell_types[i] = random.choice(wrong_cts)
        compatible = set(celltype_to_genotypes[true_ct])
        wrong_genos = [g for g in genotypes if g not in compatible]
        if wrong_genos:
            noisy_genotypes[i] = random.choice(wrong_genos)

    # ---------------------------------------------
    # Generate reference transcriptome per CellType
    # ---------------------------------------------
    expr_reference = {
        ct: np.random.lognormal(mean=1.0, sigma=0.5, size=F)
        for ct in cell_types
    }

    # Gene-specific noise scale, shared by all cells and cell types.
    gene_noise_factors = np.random.uniform(0.5, 1.5, size=F)
    noise_std = expr_error * gene_noise_factors

    # Expression matrix (genes x cells): reference profile of the cell's
    # (noisy) cell type plus Gaussian noise.
    # NOTE(review): the noise is not clipped, so values can be negative.
    expr_data = np.zeros((F, N))
    for i, ct in enumerate(noisy_cell_types):
        expr_data[:, i] = np.random.normal(loc=expr_reference[ct],
                                           scale=noise_std)

    # Apply sparsity (dropouts): zero out a fixed fraction of genes per cell.
    n_expr_zeros = int(expr_sparsity * F)
    for i in range(N):
        zero_indices = np.random.choice(F, size=n_expr_zeros, replace=False)
        expr_data[zero_indices, i] = 0.0

    expr_data[np.abs(expr_data) < 1e-10] = 0.0  # clean -0

    expr_df = pd.DataFrame(expr_data,
                           index=[f"Gene_{i}" for i in range(F)],
                           columns=cells)
    expr_df.to_csv(expr_out)

    # ------------------------------------------
    # Generate reference genotype profiles
    # ------------------------------------------
    variant_reference = {
        g: np.random.binomial(1, 0.1, size=V) for g in genotypes
    }

    # Variant matrix (variants x cells) derived from each cell's genotype.
    n_changes = int(var_error * V)
    n_var_zeros = int(var_sparsity * V)
    var_data = np.zeros((V, N), dtype=int)
    for i in range(N):
        profile = variant_reference[noisy_genotypes[i]].copy()

        # Introduce random changes: re-draw n_changes entries as 0 or 1.
        indices_to_modify = np.random.choice(V, size=n_changes, replace=False)
        profile[indices_to_modify] = np.random.randint(0, 2, size=n_changes)

        # Apply sparsity (dropouts): zero out a fixed fraction of variants.
        zero_indices = np.random.choice(V, size=n_var_zeros, replace=False)
        profile[zero_indices] = 0

        var_data[:, i] = profile

    var_df = pd.DataFrame(var_data,
                          index=[f"Variant_{i}" for i in range(V)],
                          columns=cells)
    var_df.to_csv(var_out)

    # ------------------------------------------
    # Save CellType <-> Genotype mapping (truth)
    # ------------------------------------------
    ct_gt = pd.DataFrame(
        [(ct, g) for ct, gs in celltype_to_genotypes.items() for g in gs],
        columns=["CellType", "Genotype"],
    )
    ct_gt.to_csv(truth_out, index=False)

    # ------------------------------------------
    # Save per-cell metadata
    # ------------------------------------------
    metadata_df = pd.DataFrame({
        "Cell": cells,
        "CellType": noisy_cell_types,
        "Genotype": noisy_genotypes,
        "TrueCellType": true_cell_types,
        "TrueGenotype": true_genotypes,
    })
    metadata_df.to_csv(cell_metadata_out, index=False)

    print(f"✔ Expression matrix saved to: {expr_out}")
    print(f"✔ Variant matrix saved to: {var_out}")
    print(f"✔ CellType–Genotype mapping saved to: {truth_out}")
    print(f"✔ Cell metadata (true + noisy labels) saved to: {cell_metadata_out}")


def main(argv=None):
    """Parse command-line arguments and run :func:`generate_data`.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Generate synthetic single-cell data with realistic transcriptome and variant noise.")

    parser.add_argument("--cells", type=int, required=True)
    parser.add_argument("--genes", type=int, required=True)
    parser.add_argument("--variants", type=int, required=True)
    parser.add_argument("--celltypes", type=int, required=True)
    parser.add_argument("--genotypes", type=int, required=True)

    parser.add_argument("--expr_out", type=str, required=True)
    parser.add_argument("--var_out", type=str, required=True)
    parser.add_argument("--truth_out", type=str, required=True)
    parser.add_argument("--cell_metadata_out", type=str, required=True)

    # Sparsity = dropout-like effect
    parser.add_argument("--expr_sparsity", type=float, default=0.3, help="Fraction of genes set to 0 in each cell (simulates dropout)")
    parser.add_argument("--var_sparsity", type=float, default=0.1, help="Fraction of variants set to 0 in each cell (simulates missed calls)")

    # Noise = deviation from reference profile
    parser.add_argument("--expr_error", type=float, default=0.3, help="Amount of expression noise per gene per cell")
    parser.add_argument("--var_error", type=float, default=0.3, help="Fraction of variants changed arbitrarily per cell")

    # Label noise = wrong labels
    parser.add_argument("--label_noise", type=float, default=0.0, help="Fraction of cells with incorrect celltype/genotype labels")

    # Reproducibility
    parser.add_argument("--seed", type=int, default=42, help="Seed for the random number generators")

    args = parser.parse_args(argv)

    generate_data(
        N=args.cells,
        F=args.genes,
        V=args.variants,
        T=args.celltypes,
        G=args.genotypes,
        expr_sparsity=args.expr_sparsity,
        var_sparsity=args.var_sparsity,
        expr_error=args.expr_error,
        var_error=args.var_error,
        label_noise=args.label_noise,
        expr_out=args.expr_out,
        var_out=args.var_out,
        truth_out=args.truth_out,
        cell_metadata_out=args.cell_metadata_out,
        seed=args.seed,
    )


if __name__ == "__main__":
    main()