{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"id": "2fe6811b-dc90-4869-880c-3686f5da116f",
"metadata": {},
"outputs": [],
"source": [
"# !pip install muon scanpy mofapy2 # se servisse\n",
"import os\n",
"out_dir = \"../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/\"\n",
"os.makedirs(out_dir, exist_ok=True)\n",
"\n",
"# Config Scanpy\n",
"import scanpy as sc\n",
"sc.settings.figdir = os.path.join(out_dir, \"figures\")\n",
"os.makedirs(sc.settings.figdir, exist_ok=True)\n",
"sc.settings.autoshow = False"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "09df59a3-8faf-48f7-bbc5-102b750134ab",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import scanpy as sc\n",
"import muon as mu\n",
"from anndata import AnnData\n",
"from sklearn.metrics import adjusted_rand_score\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "8fa241bd-d809-4c1b-a18d-90462ec6e255",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RNA shape (genes × cells): (10000, 5000)\nDNA shape (variants × cells): (20000, 5000)",
"\n[DEBUG] Prime 5×5 celle RNA (righe=celle, colonne=geni):\n[[1.5445 0. 3.065 1.8446 2.3458]\n [0. 1.4837 0. 0. 1.6266]\n [1.3126 2.223 4.4167 4.5424 0. ]\n [5.8761 5.6104 4.0811 0. 4.1624]\n [2.2925 1.86 0. 0. 2.099 ]]\n\n[DEBUG] Prime 5×5 celle DNA (righe=celle, colonne=varianti):\n[[0 0 1 0 0]\n [0 0 0 0 0]\n [1 0 1 0 0]\n [0 1 0 0 0]\n [0 0 1 0 0]]",
"\n[INFO] RNA — valore medio: 2.1562, std: 1.9827",
"[INFO] DNA — valore medio: 0.1973, std: 0.3979"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from anndata import AnnData\n",
"\n",
"rna_csv = \"../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/gene_expression.csv\"\n",
"dna_csv = \"../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/variant_matrix.csv\"\n",
"\n",
"rna_df = pd.read_csv(rna_csv, index_col=0)\n",
"dna_df = pd.read_csv(dna_csv, index_col=0)\n",
"\n",
"print(f\"RNA shape (genes × cells): {rna_df.shape}\")\n",
"print(f\"DNA shape (variants × cells): {dna_df.shape}\")\n",
"\n",
"# Crea AnnData\n",
"rna_adata = AnnData(rna_df.T.copy())\n",
"dna_adata = AnnData(dna_df.T.copy())\n",
"\n",
"rna_adata.var_names.name = \"gene\"\n",
"dna_adata.var_names.name = \"variant\"\n",
"\n",
"# Verifica coerenza dei barcode\n",
"assert (rna_adata.obs_names == dna_adata.obs_names).all(), \"Cell names don't match!\"\n",
"\n",
"# === Debug: prime 5×5 celle per ciascuna omica ===\n",
"print(\"\\n[DEBUG] Prime 5×5 celle RNA (righe=celle, colonne=geni):\")\n",
"print(np.round(rna_adata.X[:5, :5], 4))\n",
"\n",
"print(\"\\n[DEBUG] Prime 5×5 celle DNA (righe=celle, colonne=varianti):\")\n",
"print(np.round(dna_adata.X[:5, :5], 4))\n",
"\n",
"# === Piccolo riassunto dei valori ===\n",
"print(f\"\\n[INFO] RNA — valore medio: {rna_adata.X.mean():.4f}, std: {rna_adata.X.std():.4f}\")\n",
"print(f\"[INFO] DNA — valore medio: {dna_adata.X.mean():.4f}, std: {dna_adata.X.std():.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "b7824e6e-e6c4-4281-a4da-5a1b827fa000",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MuData object with n_obs × n_vars = 5000 × 30000\n 2 modalities\n rna:\t5000 x 10000\n dna:\t5000 x 20000"
]
}
],
"source": [
"mdata = mu.MuData({\"rna\": rna_adata, \"dna\": dna_adata})\n",
"mdata.write(os.path.join(out_dir, \"mdata_raw.h5mu\"))\n",
"print(mdata)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "90ce219f-7613-4e6d-8675-d81b1b6cc6a7",
"metadata": {},
"outputs": [],
"source": [
"#sc.pp.normalize_total(mdata[\"rna\"])\n",
"#sc.pp.log1p(mdata[\"rna\"])\n",
"#sc.pp.highly_variable_genes(mdata[\"rna\"], flavor=\"cell_ranger\", n_top_genes=2000)\n",
"#mdata.mod[\"rna\"] = mdata[\"rna\"][:, mdata[\"rna\"].var.highly_variable].copy()\n",
"sc.pp.scale(mdata[\"rna\"], max_value=10)\n",
"sc.tl.pca(mdata[\"rna\"], n_comps=50)\n",
"sc.pp.neighbors(mdata[\"rna\"], n_pcs=30)\n",
"sc.tl.umap(mdata[\"rna\"])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "c44f00fe-fb8e-4fb3-8a85-0766d262c03e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[INFO] Preprocessing DNA binario completato."
]
}
],
"source": [
"import numpy as np\n",
"import scanpy as sc\n",
"\n",
"# Assicurati che i dati siano binari float\n",
"mdata[\"dna\"].X = (mdata[\"dna\"].X > 0).astype(float)\n",
"\n",
"# Filtra varianti troppo rare\n",
"sc.pp.filter_genes(mdata[\"dna\"], min_cells=5)\n",
"\n",
"# Filtra varianti troppo frequenti (>95% delle cellule)\n",
"to_keep = np.array((mdata[\"dna\"].X.sum(axis=0) < 0.95 * mdata[\"dna\"].n_obs)).flatten()\n",
"mdata.mod[\"dna\"] = mdata[\"dna\"][:, to_keep].copy()\n",
"\n",
"# Scala per PCA\n",
"sc.pp.scale(mdata[\"dna\"], max_value=10)\n",
"\n",
"# PCA + neighbors + UMAP\n",
"sc.tl.pca(mdata[\"dna\"], n_comps=50)\n",
"sc.pp.neighbors(mdata[\"dna\"], n_pcs=30)\n",
"sc.tl.umap(mdata[\"dna\"])\n",
"\n",
"print(\"[INFO] Preprocessing DNA binario completato.\")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "9a590752-5c73-4bdb-b55d-fb8932b7face",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[INFO] Carico metadati da: ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/cell_metadata.csv\n[INFO] Trovate colonne cell_type (TrueCellType) e genotype (TrueGenotype)\n→ Colonna etichetta vera utilizzata: true_label"
]
}
],
"source": [
"import pandas as pd\n",
"import os\n",
"\n",
"# Percorso opzionale del file di metadati (modificalo se serve)\n",
"meta_csv = \"../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/cell_metadata.csv\"\n",
"\n",
"true_label_col = None # nome della colonna con etichetta vera\n",
"\n",
"if os.path.exists(meta_csv):\n",
" print(f\"[INFO] Carico metadati da: {meta_csv}\")\n",
" metadata = pd.read_csv(meta_csv)\n",
"\n",
" # Tenta di riconoscere la colonna delle cellule\n",
" if \"Cell\" in metadata.columns:\n",
" metadata = metadata.set_index(\"Cell\")\n",
" elif \"cell_id\" in metadata.columns:\n",
" metadata = metadata.set_index(\"cell_id\")\n",
" else:\n",
" print(\"[WARN] Nessuna colonna 'Cell' trovata nei metadati — controlla il file.\")\n",
" print(\"Colonne disponibili:\", metadata.columns.tolist())\n",
"\n",
" # Aggiungi i metadati alle modalità RNA, DNA e alla vista combinata\n",
" for mod in [\"rna\", \"dna\"]:\n",
" mdata.mod[mod].obs = mdata.mod[mod].obs.join(metadata, how=\"left\")\n",
" mdata.obs = mdata.obs.join(metadata, how=\"left\")\n",
"\n",
" # Cerca possibili colonne che rappresentano la “verità” (label ground truth)\n",
" ct_cols = [c for c in [\"TrueCellType\", \"CellType\", \"cell_type\"] if c in mdata.obs.columns]\n",
" gt_cols = [c for c in [\"TrueGenotype\", \"Genotype\", \"genotype\"] if c in mdata.obs.columns]\n",
"\n",
" # Combina in una singola colonna \"true_label\"\n",
" if ct_cols and gt_cols:\n",
" mdata.obs[\"true_label\"] = (\n",
" mdata.obs[ct_cols[0]].astype(str) + \"_\" + mdata.obs[gt_cols[0]].astype(str)\n",
" )\n",
" true_label_col = \"true_label\"\n",
" print(f\"[INFO] Trovate colonne cell_type ({ct_cols[0]}) e genotype ({gt_cols[0]})\")\n",
" elif ct_cols:\n",
" mdata.obs[\"true_label\"] = mdata.obs[ct_cols[0]].astype(str)\n",
" true_label_col = \"true_label\"\n",
" print(f\"[INFO] Trovata colonna cell_type ({ct_cols[0]})\")\n",
" elif gt_cols:\n",
" mdata.obs[\"true_label\"] = mdata.obs[gt_cols[0]].astype(str)\n",
" true_label_col = \"true_label\"\n",
" print(f\"[INFO] Trovata colonna genotype ({gt_cols[0]})\")\n",
" else:\n",
" print(\"[WARN] Nessuna colonna adatta trovata per 'true_label'.\")\n",
"\n",
"else:\n",
" print(f\"[WARN] Nessun file di metadati trovato in {meta_csv}\")\n",
"\n",
"print(\"→ Colonna etichetta vera utilizzata:\", true_label_col)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "6ae58343-bc13-4cbc-b618-20ad608eff0d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[INFO] Costruisco grafo multimodale WNN...",
"[INFO] Calcolo UMAP su WNN...",
"[INFO] Leiden WNN alle risoluzioni: [0.5, 1.0, 1.5, 2.0]",
"[INFO] Salvato plot → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/figures_wnn/wnn_umap_leiden_res0.5.pdf",
"[INFO] Salvato plot → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/figures_wnn/wnn_umap_leiden_res1.0.pdf",
"[INFO] Salvato plot → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/figures_wnn/wnn_umap_leiden_res1.5.pdf",
"[INFO] Salvato plot → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/figures_wnn/wnn_umap_leiden_res2.0.pdf"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" method | \n",
" resolution | \n",
" ARI | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" WNN | \n",
" 0.5 | \n",
" 0.264106 | \n",
"
\n",
" \n",
" | 1 | \n",
" WNN | \n",
" 1.0 | \n",
" 0.263178 | \n",
"
\n",
" \n",
" | 2 | \n",
" WNN | \n",
" 1.5 | \n",
" 0.271365 | \n",
"
\n",
" \n",
" | 3 | \n",
" WNN | \n",
" 2.0 | \n",
" 0.311895 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" method resolution ARI\n",
"0 WNN 0.5 0.264106\n",
"1 WNN 1.0 0.263178\n",
"2 WNN 1.5 0.271365\n",
"3 WNN 2.0 0.311895"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[INFO] ARI salvati → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/ARI_WNN_summary.csv",
"[INFO] Salvato MuData → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/mdata_after_wnn.h5mu"
]
}
],
"source": [
"# --- BLOCCO 8: Integrazione WNN (RNA+DNA) + UMAP + Leiden + ARI ---\n",
"\n",
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"import scanpy as sc\n",
"import muon as mu\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Cartelle output (riusa out_dir se già definita, altrimenti usa una di default)\n",
"if \"out_dir\" not in globals():\n",
" out_dir = \"./wnn_out/\"\n",
"os.makedirs(out_dir, exist_ok=True)\n",
"fig_dir = os.path.join(out_dir, \"figures_wnn\")\n",
"os.makedirs(fig_dir, exist_ok=True)\n",
"\n",
"# 0) Controlli di base sulle modalità\n",
"assert \"rna\" in mdata.mod and \"dna\" in mdata.mod, \"Mi servono le modalità 'rna' e 'dna' dentro mdata.mod\"\n",
"\n",
"print(\"[INFO] Costruisco grafo multimodale WNN...\")\n",
"mu.pp.neighbors(mdata, key_added=\"wnn\") # crea: mdata.uns['wnn'], mdata.obsp['wnn_connectivities'], mdata.obsp['wnn_distances']\n",
"\n",
"# 1) UMAP sul grafo WNN\n",
"print(\"[INFO] Calcolo UMAP su WNN...\")\n",
"mu.tl.umap(mdata, neighbors_key=\"wnn\")\n",
"\n",
"# Salva anche con un nome esplicito\n",
"mdata.obsm[\"X_wnn_umap\"] = mdata.obsm[\"X_umap\"].copy()\n",
"\n",
"# 2) Clustering Leiden su WNN a più risoluzioni\n",
"wnn_res_list = [0.5, 1.0, 1.5, 2.0]\n",
"print(f\"[INFO] Leiden WNN alle risoluzioni: {wnn_res_list}\")\n",
"for res in wnn_res_list:\n",
" key = f\"leiden_wnn_{res}\"\n",
" sc.tl.leiden(\n",
" mdata,\n",
" neighbors_key=\"wnn\",\n",
" resolution=res,\n",
" key_added=key,\n",
" flavor=\"igraph\",\n",
" n_iterations=2,\n",
" directed=False,\n",
" )\n",
"\n",
" # Plot UMAP colorata per questo clustering\n",
" fig, ax = plt.subplots(figsize=(6, 5))\n",
" sc.pl.embedding(\n",
" mdata,\n",
" basis=\"X_wnn_umap\",\n",
" color=key,\n",
" title=f\"WNN UMAP — Leiden res={res}\",\n",
" frameon=False,\n",
" ax=ax,\n",
" show=False,\n",
" )\n",
" out_path = os.path.join(fig_dir, f\"wnn_umap_leiden_res{res}.pdf\")\n",
" plt.savefig(out_path, dpi=300, bbox_inches=\"tight\")\n",
" plt.close(fig)\n",
" print(f\"[INFO] Salvato plot → {out_path}\")\n",
"\n",
"# (Opzionale) Plot per “modality” se esiste in mdata.obs\n",
"if \"modality\" in mdata.obs.columns:\n",
" fig, ax = plt.subplots(figsize=(6, 5))\n",
" sc.pl.embedding(\n",
" mdata,\n",
" basis=\"X_wnn_umap\",\n",
" color=\"modality\",\n",
" title=\"WNN UMAP — modality\",\n",
" frameon=False,\n",
" ax=ax,\n",
" show=False,\n",
" )\n",
" out_path = os.path.join(fig_dir, \"wnn_umap_modality.pdf\")\n",
" plt.savefig(out_path, dpi=300, bbox_inches=\"tight\")\n",
" plt.close(fig)\n",
" print(f\"[INFO] Salvato plot → {out_path}\")\n",
"\n",
"# 3) Calcolo ARI (se è stata preparata la colonna true_label nel Blocco 7)\n",
"def _maybe_ari(obs, pred_col, label_col=None):\n",
" if label_col is None or label_col not in obs.columns:\n",
" return np.nan\n",
" valid = obs[[label_col, pred_col]].dropna()\n",
" if valid.empty:\n",
" return np.nan\n",
" return adjusted_rand_score(valid[label_col].astype(str), valid[pred_col].astype(str))\n",
"\n",
"from sklearn.metrics import adjusted_rand_score\n",
"\n",
"results = []\n",
"label_col = None\n",
"if \"true_label_col\" in globals() and true_label_col is not None:\n",
" label_col = true_label_col\n",
"elif \"true_label\" in mdata.obs.columns:\n",
" label_col = \"true_label\"\n",
"\n",
"if label_col is not None:\n",
" for res in wnn_res_list:\n",
" col = f\"leiden_wnn_{res}\"\n",
" ari = _maybe_ari(mdata.obs, col, label_col)\n",
" results.append({\"method\": \"WNN\", \"resolution\": res, \"ARI\": float(ari)})\n",
" res_df = pd.DataFrame(results).sort_values([\"resolution\"])\n",
" display(res_df)\n",
" csv_path = os.path.join(out_dir, \"ARI_WNN_summary.csv\")\n",
" res_df.to_csv(csv_path, index=False)\n",
" print(f\"[INFO] ARI salvati → {csv_path}\")\n",
"else:\n",
" print(\"[WARN] Nessuna colonna di etichette vere trovata (true_label). Salto il calcolo ARI.\")\n",
"\n",
"# 4) Salva il MuData aggiornato\n",
"mdata.write(os.path.join(out_dir, \"mdata_after_wnn.h5mu\"))\n",
"print(f\"[INFO] Salvato MuData → {os.path.join(out_dir, 'mdata_after_wnn.h5mu')}\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "3d0f7df4-f85a-473b-a844-d9b65cbeac2e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[INFO] Inizio integrazione per concatenazione PCA (DEBUG 5×5 MODE)...\n\n[DEBUG] --- RNA ---\nShape (cells × features): (5000, 10000)\nPrime 5×5 celle rna (normalizzate e scalate):\n[[-4.5930e-01 -1.0819e+00 4.8890e-01 -3.9620e-01 1.5960e-01]\n [-1.0020e+00 -2.8540e-01 -1.4451e+00 -1.3384e+00 -3.0990e-01]\n [-5.4080e-01 1.1150e-01 1.3419e+00 9.8180e-01 -1.3718e+00]\n [ 1.0625e+00 1.9301e+00 1.1301e+00 -1.3384e+00 1.3454e+00]\n [-1.9660e-01 -8.3400e-02 -1.4451e+00 -1.3384e+00 -1.5000e-03]]\nPCA shape: (5000, 50), varianza media: 51.6740\nPrime 5x5 celle della PCA:\n[[-36.2213 28.7004 16.1661 -0.816 -3.7326]\n [-36.9042 26.9355 16.6614 -1.1801 3.8297]\n [ 1.8618 -3.6369 -18.9392 45.1308 1.4271]\n [ 6.9455 -32.0362 35.4417 -1.3034 -1.8827]\n [-37.3183 28.8054 16.4231 -1.7821 5.1176]]\nPCA variance ratio (prime 5): [0.0619 0.0602 0.0595 0.0569 0.0004]\n\n[DEBUG] --- DNA ---\nShape (cells × features): (5000, 20000)\nPrime 5×5 celle dna (normalizzate e scalate):\n[[-0.4074 -0.3807 1.7872 -0.3896 -0.395 ]\n [-0.4074 -0.3807 -0.5594 -0.3896 -0.395 ]\n [ 2.4538 -0.3807 1.7872 -0.3896 -0.395 ]\n [-0.4074 2.6263 -0.5594 -0.3896 -0.395 ]\n [-0.4074 -0.3807 1.7872 -0.3896 -0.395 ]]\nPCA shape: (5000, 50), varianza media: 54.3697\nPrime 5x5 celle della PCA:\n[[-23.7148 2.554 54.0891 -0.8168 0.8134]\n [-32.1915 -40.0711 -16.1456 2.0053 2.1533]\n [-21.7155 2.5589 52.835 3.8058 0.8215]\n [-22.7593 2.0298 53.5315 -0.1129 -0.8372]\n [-31.3252 -38.9129 -18.8992 -0.4091 -0.2577]]\nPCA variance ratio (prime 5): [0.0513 0.0347 0.0315 0.0004 0.0004]\n\n[INFO] Celle allineate: 5000\n[INFO] Concatenazione PCA completata: (5000, 100)\n[DEBUG] Prime 5×5 celle CONCATENATE:\n[[-36.2213 28.7004 16.1661 -0.816 -3.7326]\n [-36.9042 26.9355 16.6614 -1.1801 3.8297]\n [ 1.8618 -3.6369 -18.9392 45.1308 1.4271]\n [ 6.9455 -32.0362 35.4417 -1.3034 -1.8827]\n [-37.3183 28.8054 16.4231 -1.7821 5.1176]]\n[DEBUG] Neighbors graph: shape=(5000, 5000), mean_conn=0.003063, nnz=111560\n[DEBUG] Neighbors graph: shape=(5000, 5000), mean_conn=0.081779, nnz=70000",
"[INFO] Calcolo Leiden per risoluzioni: [0.5, 1.0, 1.5, 2.0]\n[DEBUG] → leiden_concat_0.5: 20 clusters | mean=250.0 | min=40 | max=746",
"[INFO] Salvato → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/concat_pca/figures_concat/concat_umap_leiden_res0.5.pdf\n[DEBUG] → leiden_concat_1: 20 clusters | mean=250.0 | min=40 | max=746",
"[INFO] Salvato → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/concat_pca/figures_concat/concat_umap_leiden_res1.0.pdf\n[DEBUG] → leiden_concat_1.5: 20 clusters | mean=250.0 | min=40 | max=746",
"[INFO] Salvato → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/concat_pca/figures_concat/concat_umap_leiden_res1.5.pdf\n[DEBUG] → leiden_concat_2: 20 clusters | mean=250.0 | min=40 | max=746",
"[INFO] Salvato → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/concat_pca/figures_concat/concat_umap_leiden_res2.0.pdf\n\n=== RISULTATI ARI CONCAT (DEBUG 5×5) ===\n method resolution ARI\nConcatPCA 0.5 0.569959\nConcatPCA 1.0 0.569959\nConcatPCA 1.5 0.569959\nConcatPCA 2.0 0.569959\n[INFO] ARI salvati → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/concat_pca/ARI_ConcatPCA_summary.csv",
"[INFO] Salvato MuData → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/concat_pca/mdata_after_concat.h5mu"
]
}
],
"source": [
"# --- BLOCCO 9: Integrazione concatenazione PCA (RNA+DNA) + Leiden + ARI (DEBUG 5×5 MODE) ---\n",
"import numpy as np\n",
"import pandas as pd\n",
"import scanpy as sc\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.metrics import adjusted_rand_score\n",
"import os\n",
"\n",
"concat_dir = os.path.join(out_dir, \"concat_pca\")\n",
"os.makedirs(concat_dir, exist_ok=True)\n",
"fig_dir = os.path.join(concat_dir, \"figures_concat\")\n",
"os.makedirs(fig_dir, exist_ok=True)\n",
"\n",
"print(\"[INFO] Inizio integrazione per concatenazione PCA (DEBUG 5×5 MODE)...\")\n",
"\n",
"# === 1️⃣ RNA e DNA PCA check ===\n",
"for mod in [\"rna\", \"dna\"]:\n",
" if \"X_pca\" not in mdata[mod].obsm:\n",
" raise ValueError(f\"[ERROR] Manca PCA per {mod}! Riesegui sc.tl.pca.\")\n",
" \n",
" print(f\"\\n[DEBUG] --- {mod.upper()} ---\")\n",
" print(f\"Shape (cells × features): {mdata[mod].shape}\")\n",
"\n",
" # Prime 5×5 celle della matrice normalizzata\n",
" X = mdata[mod].X\n",
" block_raw = X[:5, :5].A if hasattr(X, \"A\") else np.asarray(X[:5, :5])\n",
" print(f\"Prime 5×5 celle {mod} (normalizzate e scalate):\\n{np.round(block_raw, 4)}\")\n",
"\n",
" # PCA info# PCA info\n",
" X_pca = mdata[mod].obsm[\"X_pca\"]\n",
" print(f\"PCA shape: {X_pca.shape}, varianza media: {np.var(X_pca):.4f}\")\n",
"\n",
" # Mostra le prime 5 righe e 5 colonne della PCA\n",
" print(\"Prime 5x5 celle della PCA:\")\n",
" print(np.round(X_pca[:5, :5], 4))\n",
"\n",
" # Varianza spiegata (se disponibile)\n",
" if \"pca\" in mdata[mod].uns and \"variance_ratio\" in mdata[mod].uns[\"pca\"]:\n",
" vr = mdata[mod].uns[\"pca\"][\"variance_ratio\"]\n",
" print(f\"PCA variance ratio (prime 5): {np.round(vr[:5], 4)}\")\n",
" \n",
"# === 2️⃣ Estrai PCA e concatena ===\n",
"pca_rna = mdata[\"rna\"].obsm[\"X_pca\"]\n",
"pca_dna = mdata[\"dna\"].obsm[\"X_pca\"]\n",
"\n",
"if pca_rna.shape[0] != pca_dna.shape[0]:\n",
" raise ValueError(f\"[ERROR] RNA e DNA hanno celle diverse: RNA={pca_rna.shape[0]} DNA={pca_dna.shape[0]}\")\n",
"\n",
"print(f\"\\n[INFO] Celle allineate: {pca_rna.shape[0]}\")\n",
"X_concat = np.concatenate([pca_rna, pca_dna], axis=1)\n",
"mdata.obsm[\"X_concat_pca\"] = X_concat\n",
"\n",
"print(f\"[INFO] Concatenazione PCA completata: {X_concat.shape}\")\n",
"print(f\"[DEBUG] Prime 5×5 celle CONCATENATE:\\n{np.round(X_concat[:5, :5], 4)}\")\n",
"\n",
"# === 3️⃣ Neighbors, UMAP, Leiden ===\n",
"sc.pp.neighbors(mdata, use_rep=\"X_concat_pca\")\n",
"conn = mdata.obsp[\"connectivities\"]\n",
"dist = mdata.obsp[\"distances\"]\n",
"print(f\"[DEBUG] Neighbors graph: shape={conn.shape}, mean_conn={conn.mean():.6f}, nnz={conn.nnz}\")\n",
"print(f\"[DEBUG] Neighbors graph: shape={dist.shape}, mean_conn={dist.mean():.6f}, nnz={dist.nnz}\")\n",
"\n",
"\n",
"sc.tl.umap(mdata)\n",
"mdata.obsm[\"X_concat_umap\"] = mdata.obsm[\"X_umap\"].copy()\n",
"\n",
"concat_res_list = [0.5, 1.0, 1.5, 2.0]\n",
"print(f\"[INFO] Calcolo Leiden per risoluzioni: {concat_res_list}\")\n",
"\n",
"for res in concat_res_list:\n",
" key = f\"leiden_concat_{res:.2f}\".rstrip(\"0\").rstrip(\".\")\n",
" sc.tl.leiden(\n",
" mdata,\n",
" resolution=res,\n",
" key_added=key,\n",
" flavor=\"igraph\",\n",
" n_iterations=2,\n",
" directed=False,\n",
" random_state=0, # deterministic\n",
" )\n",
"\n",
" # Debug cluster info\n",
" n_clusters = mdata.obs[key].nunique()\n",
" sizes = mdata.obs[key].value_counts().sort_index().values\n",
" print(f\"[DEBUG] → {key}: {n_clusters} clusters | mean={np.mean(sizes):.1f} | min={np.min(sizes)} | max={np.max(sizes)}\")\n",
"\n",
" # UMAP plot\n",
" fig, ax = plt.subplots(figsize=(6, 5))\n",
" sc.pl.embedding(\n",
" mdata,\n",
" basis=\"X_concat_umap\",\n",
" color=key,\n",
" title=f\"Concat PCA UMAP — Leiden res={res}\",\n",
" frameon=False,\n",
" ax=ax,\n",
" show=False,\n",
" )\n",
" out_path = os.path.join(fig_dir, f\"concat_umap_leiden_res{res}.pdf\")\n",
" plt.savefig(out_path, dpi=300, bbox_inches=\"tight\")\n",
" plt.close(fig)\n",
" print(f\"[INFO] Salvato → {out_path}\")\n",
"\n",
"# === 4️⃣ ARI (se presente true_label) ===\n",
"label_col = None\n",
"if \"true_label_col\" in globals() and true_label_col is not None:\n",
" label_col = true_label_col\n",
"elif \"true_label\" in mdata.obs.columns:\n",
" label_col = \"true_label\"\n",
"\n",
"def _maybe_ari(obs, pred_col, label_col=None):\n",
" if label_col is None or label_col not in obs.columns:\n",
" return np.nan\n",
" valid = obs[[label_col, pred_col]].dropna()\n",
" if valid.empty:\n",
" return np.nan\n",
" return adjusted_rand_score(valid[label_col].astype(str), valid[pred_col].astype(str))\n",
"\n",
"results_concat = []\n",
"if label_col is not None:\n",
" for res in concat_res_list:\n",
" col = f\"leiden_concat_{res:.2f}\".rstrip(\"0\").rstrip(\".\")\n",
" ari = _maybe_ari(mdata.obs, col, label_col)\n",
" results_concat.append({\"method\": \"ConcatPCA\", \"resolution\": res, \"ARI\": float(ari)})\n",
" df_concat = pd.DataFrame(results_concat).sort_values(\"resolution\")\n",
" print(\"\\n=== RISULTATI ARI CONCAT (DEBUG 5×5) ===\")\n",
" print(df_concat.to_string(index=False))\n",
" csv_path = os.path.join(concat_dir, \"ARI_ConcatPCA_summary.csv\")\n",
" df_concat.to_csv(csv_path, index=False)\n",
" print(f\"[INFO] ARI salvati → {csv_path}\")\n",
"else:\n",
" print(\"[WARN] Nessuna colonna true_label trovata, salto ARI per Concat PCA.\")\n",
"\n",
"# === 5️⃣ Salvataggio MuData ===\n",
"out_path = os.path.join(concat_dir, \"mdata_after_concat.h5mu\")\n",
"mdata.write(out_path)\n",
"print(f\"[INFO] Salvato MuData → {out_path}\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "05047a95-cc55-4cbb-abad-a336c2074c6e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[INFO] Likelihood per vista: {'rna': 'gaussian', 'dna': 'gaussian'}\n[INFO] Likelihoods usate in ordine: ['gaussian', 'gaussian']\n[INFO] Avvio MOFA con n_factors=10...\n\n #########################################################\n ### __ __ ____ ______ ### \n ### | \\/ |/ __ \\| ____/\\ _ ### \n ### | \\ / | | | | |__ / \\ _| |_ ### \n ### | |\\/| | | | | __/ /\\ \\_ _| ###\n ### | | | | |__| | | / ____ \\|_| ###\n ### |_| |_|\\____/|_|/_/ \\_\\ ###\n ### ### \n ######################################################### \n \n \n ",
"Loaded view='rna' group='group1' with N=5000 samples and D=10000 features...\nLoaded view='dna' group='group1' with N=5000 samples and D=20000 features...\n\n",
"Model options:\n- Automatic Relevance Determination prior on the factors: True\n- Automatic Relevance Determination prior on the weights: True\n- Spike-and-slab prior on the factors: False\n- Spike-and-slab prior on the weights: True\nLikelihoods:\n- View 0 (rna): gaussian\n- View 1 (dna): gaussian\n\n",
"\n\n######################################\n## Training the model with seed 0 ##\n######################################\n\n",
"\nConverged!\n\n\n\n#######################\n## Training finished ##\n#######################\n\n\nSaving model in /tmp/mofa_20251125-223303.hdf5...",
"Saved MOFA embeddings in .obsm['X_mofa'] slot and their loadings in .varm['LFs'].\n[INFO] Calcolo neighbors e UMAP su fattori MOFA...",
"[INFO] Calcolo Leiden per risoluzioni: [0.5, 1.0, 1.5, 2.0]",
"[INFO] Salvato → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/mofa/figures_mofa/mofa_umap_leiden_res0.5.pdf",
"[INFO] Salvato → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/mofa/figures_mofa/mofa_umap_leiden_res1.0.pdf",
"[INFO] Salvato → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/mofa/figures_mofa/mofa_umap_leiden_res1.5.pdf",
"[INFO] Salvato → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/mofa/figures_mofa/mofa_umap_leiden_res2.0.pdf"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" method | \n",
" resolution | \n",
" ARI | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" MOFA | \n",
" 0.5 | \n",
" 0.569959 | \n",
"
\n",
" \n",
" | 1 | \n",
" MOFA | \n",
" 1.0 | \n",
" 0.569959 | \n",
"
\n",
" \n",
" | 2 | \n",
" MOFA | \n",
" 1.5 | \n",
" 0.569959 | \n",
"
\n",
" \n",
" | 3 | \n",
" MOFA | \n",
" 2.0 | \n",
" 0.420227 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" method resolution ARI\n",
"0 MOFA 0.5 0.569959\n",
"1 MOFA 1.0 0.569959\n",
"2 MOFA 1.5 0.569959\n",
"3 MOFA 2.0 0.420227"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[INFO] ARI salvati → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/mofa/ARI_MOFA_summary.csv",
"[INFO] Salvato MuData finale → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/mofa/mdata_after_mofa.h5mu"
]
}
],
"source": [
"# --- BLOCCO 10: Integrazione MOFA(+) + UMAP + Leiden + ARI ---\n",
"\n",
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"import scanpy as sc\n",
"import muon as mu\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.metrics import adjusted_rand_score\n",
"\n",
"mofa_dir = os.path.join(out_dir, \"mofa\")\n",
"os.makedirs(mofa_dir, exist_ok=True)\n",
"fig_dir = os.path.join(mofa_dir, \"figures_mofa\")\n",
"os.makedirs(fig_dir, exist_ok=True)\n",
"\n",
"# 0️⃣ Verifica che mofapy2 sia installato\n",
"try:\n",
" import mofapy2 # noqa: F401\n",
"except ImportError:\n",
" raise ImportError(\"Devi installare mofapy2 prima di usare MOFA → pip install mofapy2\")\n",
"\n",
"# 1️⃣ Imposta likelihood per ciascuna vista\n",
"likelihoods = {}\n",
"for view in mdata.mod.keys():\n",
" X = mdata.mod[view].X\n",
" try:\n",
" vals = np.unique(X.data if hasattr(X, \"data\") else np.asarray(X).ravel())\n",
" is_binary = np.all(np.isin(vals, [0, 1]))\n",
" except Exception:\n",
" is_binary = False\n",
"\n",
" if view.lower() in (\"rna\", \"gex\", \"gene\", \"transcriptome\"):\n",
" likelihoods[view] = \"gaussian\"\n",
" elif is_binary:\n",
" likelihoods[view] = \"bernoulli\"\n",
" else:\n",
" likelihoods[view] = \"gaussian\"\n",
"\n",
"print(\"[INFO] Likelihood per vista:\", likelihoods)\n",
"\n",
"# ➕ Converti in LISTA nell'ordine delle viste in mdata.mod\n",
"likelihoods_list = [likelihoods[view] for view in mdata.mod.keys()]\n",
"print(\"[INFO] Likelihoods usate in ordine:\", likelihoods_list)\n",
"\n",
"# 2️⃣ Esegui MOFA\n",
"n_factors = 10\n",
"print(f\"[INFO] Avvio MOFA con n_factors={n_factors}...\")\n",
"mu.tl.mofa(\n",
" mdata,\n",
" n_factors=n_factors,\n",
" likelihoods=likelihoods_list, # ✅ usa lista, non dict\n",
" convergence_mode=\"medium\",\n",
" seed=0,\n",
")\n",
"\n",
"# I fattori vengono salvati in mdata.obsm[\"X_mofa\"]\n",
"assert \"X_mofa\" in mdata.obsm, \"MOFA non ha prodotto X_mofa; controlla i log di mofapy2\"\n",
"\n",
"# 3️⃣ Costruisci grafo e UMAP nello spazio MOFA\n",
"print(\"[INFO] Calcolo neighbors e UMAP su fattori MOFA...\")\n",
"sc.pp.neighbors(mdata, use_rep=\"X_mofa\")\n",
"sc.tl.umap(mdata)\n",
"mdata.obsm[\"X_mofa_umap\"] = mdata.obsm[\"X_umap\"].copy()\n",
"\n",
"# 4️⃣ Leiden multi-risoluzione\n",
"mofa_res_list = [0.5, 1.0, 1.5, 2.0]\n",
"print(f\"[INFO] Calcolo Leiden per risoluzioni: {mofa_res_list}\")\n",
"for res in mofa_res_list:\n",
" key = f\"leiden_mofa_{res}\"\n",
" sc.tl.leiden(\n",
" mdata,\n",
" resolution=res,\n",
" key_added=key,\n",
" flavor=\"igraph\",\n",
" n_iterations=2,\n",
" directed=False,\n",
" )\n",
"\n",
" # Plot\n",
" fig, ax = plt.subplots(figsize=(6, 5))\n",
" sc.pl.embedding(\n",
" mdata,\n",
" basis=\"X_mofa_umap\",\n",
" color=key,\n",
" title=f\"MOFA UMAP — Leiden res={res}\",\n",
" frameon=False,\n",
" ax=ax,\n",
" show=False,\n",
" )\n",
" out_path = os.path.join(fig_dir, f\"mofa_umap_leiden_res{res}.pdf\")\n",
" plt.savefig(out_path, dpi=300, bbox_inches=\"tight\")\n",
" plt.close(fig)\n",
" print(f\"[INFO] Salvato → {out_path}\")\n",
"\n",
"# 5️⃣ Calcolo ARI (se esiste una colonna di verità)\n",
"def _maybe_ari(obs, pred_col, label_col=None):\n",
" if label_col is None or label_col not in obs.columns:\n",
" return np.nan\n",
" valid = obs[[label_col, pred_col]].dropna()\n",
" if valid.empty:\n",
" return np.nan\n",
" return adjusted_rand_score(valid[label_col].astype(str), valid[pred_col].astype(str))\n",
"\n",
"label_col = None\n",
"if \"true_label_col\" in globals() and true_label_col is not None:\n",
" label_col = true_label_col\n",
"elif \"true_label\" in mdata.obs.columns:\n",
" label_col = \"true_label\"\n",
"\n",
"results_mofa = []\n",
"if label_col is not None:\n",
" for res in mofa_res_list:\n",
" col = f\"leiden_mofa_{res}\"\n",
" ari = _maybe_ari(mdata.obs, col, label_col)\n",
" results_mofa.append({\"method\": \"MOFA\", \"resolution\": res, \"ARI\": float(ari)})\n",
" df_mofa = pd.DataFrame(results_mofa).sort_values(\"resolution\")\n",
" display(df_mofa)\n",
" csv_path = os.path.join(mofa_dir, \"ARI_MOFA_summary.csv\")\n",
" df_mofa.to_csv(csv_path, index=False)\n",
" print(f\"[INFO] ARI salvati → {csv_path}\")\n",
"else:\n",
" print(\"[WARN] Nessuna colonna true_label trovata: salto ARI per MOFA.\")\n",
"\n",
"# 6️⃣ Salvataggio finale del MuData aggiornato\n",
"out_path = os.path.join(mofa_dir, \"mdata_after_mofa.h5mu\")\n",
"mdata.write(out_path)\n",
"print(f\"[INFO] Salvato MuData finale → {out_path}\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "40b9c845-6a75-4167-8d0d-21a5b5b99d04",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 20,
"id": "a77cd0f2-4b08-44e2-9422-6fb664613677",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "5710a4fe-6d4d-4894-a5c0-c620160836ff",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "127da0bc-d1c6-4c5a-9e0d-c601b1b9b9be",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "f605a6d7-c6e2-48e4-8510-f75266de9660",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c31cbe4-e48a-4e23-abfc-6a4cff80842b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:scvar_env]",
"language": "python",
"name": "conda-env-scvar_env-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}