{ "cells": [ { "cell_type": "code", "execution_count": 11, "id": "2fe6811b-dc90-4869-880c-3686f5da116f", "metadata": {}, "outputs": [], "source": [ "# !pip install muon scanpy mofapy2 # se servisse\n", "import os\n", "out_dir = \"../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/\"\n", "os.makedirs(out_dir, exist_ok=True)\n", "\n", "# Config Scanpy\n", "import scanpy as sc\n", "sc.settings.figdir = os.path.join(out_dir, \"figures\")\n", "os.makedirs(sc.settings.figdir, exist_ok=True)\n", "sc.settings.autoshow = False" ] }, { "cell_type": "code", "execution_count": 12, "id": "09df59a3-8faf-48f7-bbc5-102b750134ab", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import scanpy as sc\n", "import muon as mu\n", "from anndata import AnnData\n", "from sklearn.metrics import adjusted_rand_score\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 13, "id": "8fa241bd-d809-4c1b-a18d-90462ec6e255", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RNA shape (genes × cells): (10000, 5000)\nDNA shape (variants × cells): (20000, 5000)", "\n[DEBUG] Prime 5×5 celle RNA (righe=celle, colonne=geni):\n[[1.5445 0. 3.065 1.8446 2.3458]\n [0. 1.4837 0. 0. 1.6266]\n [1.3126 2.223 4.4167 4.5424 0. ]\n [5.8761 5.6104 4.0811 0. 4.1624]\n [2.2925 1.86 0. 0. 2.099 ]]\n\n[DEBUG] Prime 5×5 celle DNA (righe=celle, colonne=varianti):\n[[0 0 1 0 0]\n [0 0 0 0 0]\n [1 0 1 0 0]\n [0 1 0 0 0]\n [0 0 1 0 0]]", "\n[INFO] RNA — valore medio: 2.1562, std: 1.9827", "[INFO] DNA — valore medio: 0.1973, std: 0.3979" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "from anndata import AnnData\n", "\n", "rna_csv = \"../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/gene_expression.csv\"\n", "dna_csv = \"../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/variant_matrix.csv\"\n", "\n", "rna_df = pd.read_csv(rna_csv, index_col=0)\n", "dna_df = pd.read_csv(dna_csv, index_col=0)\n", "\n", "print(f\"RNA shape (genes × cells): {rna_df.shape}\")\n", "print(f\"DNA shape (variants × cells): {dna_df.shape}\")\n", "\n", "# Crea AnnData\n", "rna_adata = AnnData(rna_df.T.copy())\n", "dna_adata = AnnData(dna_df.T.copy())\n", "\n", "rna_adata.var_names.name = \"gene\"\n", "dna_adata.var_names.name = \"variant\"\n", "\n", "# Verifica coerenza dei barcode\n", "assert (rna_adata.obs_names == dna_adata.obs_names).all(), \"Cell names don't match!\"\n", "\n", "# === Debug: prime 5×5 celle per ciascuna omica ===\n", "print(\"\\n[DEBUG] Prime 5×5 celle RNA (righe=celle, colonne=geni):\")\n", "print(np.round(rna_adata.X[:5, :5], 4))\n", "\n", "print(\"\\n[DEBUG] Prime 5×5 celle DNA (righe=celle, colonne=varianti):\")\n", "print(np.round(dna_adata.X[:5, :5], 4))\n", "\n", "# === Piccolo riassunto dei valori ===\n", "print(f\"\\n[INFO] RNA — valore medio: {rna_adata.X.mean():.4f}, std: {rna_adata.X.std():.4f}\")\n", "print(f\"[INFO] DNA — valore medio: {dna_adata.X.mean():.4f}, std: {dna_adata.X.std():.4f}\")" ] }, { "cell_type": "code", "execution_count": 14, "id": "b7824e6e-e6c4-4281-a4da-5a1b827fa000", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MuData object with n_obs × n_vars = 5000 × 30000\n 2 modalities\n rna:\t5000 x 10000\n dna:\t5000 x 20000" ] } ], "source": [ "mdata = mu.MuData({\"rna\": rna_adata, \"dna\": dna_adata})\n", "mdata.write(os.path.join(out_dir, \"mdata_raw.h5mu\"))\n", "print(mdata)" ] }, { "cell_type": "code", "execution_count": 15, "id": "90ce219f-7613-4e6d-8675-d81b1b6cc6a7", "metadata": {}, "outputs": [], "source": [ "#sc.pp.normalize_total(mdata[\"rna\"])\n", "#sc.pp.log1p(mdata[\"rna\"])\n", "#sc.pp.highly_variable_genes(mdata[\"rna\"], flavor=\"cell_ranger\", n_top_genes=2000)\n", "#mdata.mod[\"rna\"] = mdata[\"rna\"][:, mdata[\"rna\"].var.highly_variable].copy()\n", "sc.pp.scale(mdata[\"rna\"], max_value=10)\n", "sc.tl.pca(mdata[\"rna\"], n_comps=50)\n", "sc.pp.neighbors(mdata[\"rna\"], n_pcs=30)\n", "sc.tl.umap(mdata[\"rna\"])" ] }, { "cell_type": "code", "execution_count": 16, "id": "c44f00fe-fb8e-4fb3-8a85-0766d262c03e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[INFO] Preprocessing DNA binario completato." ] } ], "source": [ "import numpy as np\n", "import scanpy as sc\n", "\n", "# Assicurati che i dati siano binari float\n", "mdata[\"dna\"].X = (mdata[\"dna\"].X > 0).astype(float)\n", "\n", "# Filtra varianti troppo rare\n", "sc.pp.filter_genes(mdata[\"dna\"], min_cells=5)\n", "\n", "# Filtra varianti troppo frequenti (>95% delle cellule)\n", "to_keep = np.array((mdata[\"dna\"].X.sum(axis=0) < 0.95 * mdata[\"dna\"].n_obs)).flatten()\n", "mdata.mod[\"dna\"] = mdata[\"dna\"][:, to_keep].copy()\n", "\n", "# Scala per PCA\n", "sc.pp.scale(mdata[\"dna\"], max_value=10)\n", "\n", "# PCA + neighbors + UMAP\n", "sc.tl.pca(mdata[\"dna\"], n_comps=50)\n", "sc.pp.neighbors(mdata[\"dna\"], n_pcs=30)\n", "sc.tl.umap(mdata[\"dna\"])\n", "\n", "print(\"[INFO] Preprocessing DNA binario completato.\")" ] }, { "cell_type": "code", "execution_count": 17, "id": "9a590752-5c73-4bdb-b55d-fb8932b7face", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[INFO] Carico metadati da: ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/cell_metadata.csv\n[INFO] Trovate colonne cell_type (TrueCellType) e genotype (TrueGenotype)\n→ Colonna etichetta vera utilizzata: true_label" ] } ], "source": [ "import pandas as pd\n", "import os\n", "\n", "# Percorso opzionale del file di metadati (modificalo se serve)\n", "meta_csv = \"../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/cell_metadata.csv\"\n", "\n", "true_label_col = None # nome della colonna con etichetta vera\n", "\n", "if os.path.exists(meta_csv):\n", " print(f\"[INFO] Carico metadati da: {meta_csv}\")\n", " metadata = pd.read_csv(meta_csv)\n", "\n", " # Tenta di riconoscere la colonna delle cellule\n", " if \"Cell\" in metadata.columns:\n", " metadata = metadata.set_index(\"Cell\")\n", " elif \"cell_id\" in metadata.columns:\n", " metadata = metadata.set_index(\"cell_id\")\n", " else:\n", " print(\"[WARN] Nessuna colonna 'Cell' trovata nei metadati — controlla il file.\")\n", " print(\"Colonne disponibili:\", metadata.columns.tolist())\n", "\n", " # Aggiungi i metadati alle modalità RNA, DNA e alla vista combinata\n", " for mod in [\"rna\", \"dna\"]:\n", " mdata.mod[mod].obs = mdata.mod[mod].obs.join(metadata, how=\"left\")\n", " mdata.obs = mdata.obs.join(metadata, how=\"left\")\n", "\n", " # Cerca possibili colonne che rappresentano la “verità” (label ground truth)\n", " ct_cols = [c for c in [\"TrueCellType\", \"CellType\", \"cell_type\"] if c in mdata.obs.columns]\n", " gt_cols = [c for c in [\"TrueGenotype\", \"Genotype\", \"genotype\"] if c in mdata.obs.columns]\n", "\n", " # Combina in una singola colonna \"true_label\"\n", " if ct_cols and gt_cols:\n", " mdata.obs[\"true_label\"] = (\n", " mdata.obs[ct_cols[0]].astype(str) + \"_\" + mdata.obs[gt_cols[0]].astype(str)\n", " )\n", " true_label_col = \"true_label\"\n", " print(f\"[INFO] Trovate colonne cell_type ({ct_cols[0]}) e genotype ({gt_cols[0]})\")\n", " elif ct_cols:\n", " mdata.obs[\"true_label\"] = mdata.obs[ct_cols[0]].astype(str)\n", " true_label_col = \"true_label\"\n", " print(f\"[INFO] Trovata colonna cell_type ({ct_cols[0]})\")\n", " elif gt_cols:\n", " mdata.obs[\"true_label\"] = mdata.obs[gt_cols[0]].astype(str)\n", " true_label_col = \"true_label\"\n", " print(f\"[INFO] Trovata colonna genotype ({gt_cols[0]})\")\n", " else:\n", " print(\"[WARN] Nessuna colonna adatta trovata per 'true_label'.\")\n", "\n", "else:\n", " print(f\"[WARN] Nessun file di metadati trovato in {meta_csv}\")\n", "\n", "print(\"→ Colonna etichetta vera utilizzata:\", true_label_col)" ] }, { "cell_type": "code", "execution_count": 18, "id": "6ae58343-bc13-4cbc-b618-20ad608eff0d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[INFO] Costruisco grafo multimodale WNN...", "[INFO] Calcolo UMAP su WNN...", "[INFO] Leiden WNN alle risoluzioni: [0.5, 1.0, 1.5, 2.0]", "[INFO] Salvato plot → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/figures_wnn/wnn_umap_leiden_res0.5.pdf", "[INFO] Salvato plot → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/figures_wnn/wnn_umap_leiden_res1.0.pdf", "[INFO] Salvato plot → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/figures_wnn/wnn_umap_leiden_res1.5.pdf", "[INFO] Salvato plot → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/figures_wnn/wnn_umap_leiden_res2.0.pdf" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
methodresolutionARI
0WNN0.50.264106
1WNN1.00.263178
2WNN1.50.271365
3WNN2.00.311895
\n", "
" ], "text/plain": [ " method resolution ARI\n", "0 WNN 0.5 0.264106\n", "1 WNN 1.0 0.263178\n", "2 WNN 1.5 0.271365\n", "3 WNN 2.0 0.311895" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[INFO] ARI salvati → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/ARI_WNN_summary.csv", "[INFO] Salvato MuData → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/mdata_after_wnn.h5mu" ] } ], "source": [ "# --- BLOCCO 8: Integrazione WNN (RNA+DNA) + UMAP + Leiden + ARI ---\n", "\n", "import os\n", "import numpy as np\n", "import pandas as pd\n", "import scanpy as sc\n", "import muon as mu\n", "import matplotlib.pyplot as plt\n", "\n", "# Cartelle output (riusa out_dir se già definita, altrimenti usa una di default)\n", "if \"out_dir\" not in globals():\n", " out_dir = \"./wnn_out/\"\n", "os.makedirs(out_dir, exist_ok=True)\n", "fig_dir = os.path.join(out_dir, \"figures_wnn\")\n", "os.makedirs(fig_dir, exist_ok=True)\n", "\n", "# 0) Controlli di base sulle modalità\n", "assert \"rna\" in mdata.mod and \"dna\" in mdata.mod, \"Mi servono le modalità 'rna' e 'dna' dentro mdata.mod\"\n", "\n", "print(\"[INFO] Costruisco grafo multimodale WNN...\")\n", "mu.pp.neighbors(mdata, key_added=\"wnn\") # crea: mdata.uns['wnn'], mdata.obsp['wnn_connectivities'], mdata.obsp['wnn_distances']\n", "\n", "# 1) UMAP sul grafo WNN\n", "print(\"[INFO] Calcolo UMAP su WNN...\")\n", "mu.tl.umap(mdata, neighbors_key=\"wnn\")\n", "\n", "# Salva anche con un nome esplicito\n", "mdata.obsm[\"X_wnn_umap\"] = mdata.obsm[\"X_umap\"].copy()\n", "\n", "# 2) Clustering Leiden su WNN a più risoluzioni\n", "wnn_res_list = [0.5, 1.0, 1.5, 2.0]\n", "print(f\"[INFO] Leiden WNN alle risoluzioni: {wnn_res_list}\")\n", "for res in wnn_res_list:\n", " key = f\"leiden_wnn_{res}\"\n", " sc.tl.leiden(\n", " mdata,\n", " neighbors_key=\"wnn\",\n", " resolution=res,\n", " key_added=key,\n", " flavor=\"igraph\",\n", " n_iterations=2,\n", " directed=False,\n", " )\n", "\n", " # Plot UMAP colorata per questo clustering\n", " fig, ax = plt.subplots(figsize=(6, 5))\n", " sc.pl.embedding(\n", " mdata,\n", " basis=\"X_wnn_umap\",\n", " color=key,\n", " title=f\"WNN UMAP — Leiden res={res}\",\n", " frameon=False,\n", " ax=ax,\n", " show=False,\n", " )\n", " out_path = os.path.join(fig_dir, f\"wnn_umap_leiden_res{res}.pdf\")\n", " plt.savefig(out_path, dpi=300, bbox_inches=\"tight\")\n", " plt.close(fig)\n", " print(f\"[INFO] Salvato plot → {out_path}\")\n", "\n", "# (Opzionale) Plot per “modality” se esiste in mdata.obs\n", "if \"modality\" in mdata.obs.columns:\n", " fig, ax = plt.subplots(figsize=(6, 5))\n", " sc.pl.embedding(\n", " mdata,\n", " basis=\"X_wnn_umap\",\n", " color=\"modality\",\n", " title=\"WNN UMAP — modality\",\n", " frameon=False,\n", " ax=ax,\n", " show=False,\n", " )\n", " out_path = os.path.join(fig_dir, \"wnn_umap_modality.pdf\")\n", " plt.savefig(out_path, dpi=300, bbox_inches=\"tight\")\n", " plt.close(fig)\n", " print(f\"[INFO] Salvato plot → {out_path}\")\n", "\n", "# 3) Calcolo ARI (se è stata preparata la colonna true_label nel Blocco 7)\n", "def _maybe_ari(obs, pred_col, label_col=None):\n", " if label_col is None or label_col not in obs.columns:\n", " return np.nan\n", " valid = obs[[label_col, pred_col]].dropna()\n", " if valid.empty:\n", " return np.nan\n", " return adjusted_rand_score(valid[label_col].astype(str), valid[pred_col].astype(str))\n", "\n", "from sklearn.metrics import adjusted_rand_score\n", "\n", "results = []\n", "label_col = None\n", "if \"true_label_col\" in globals() and true_label_col is not None:\n", " label_col = true_label_col\n", "elif \"true_label\" in mdata.obs.columns:\n", " label_col = \"true_label\"\n", "\n", "if label_col is not None:\n", " for res in wnn_res_list:\n", " col = f\"leiden_wnn_{res}\"\n", " ari = _maybe_ari(mdata.obs, col, label_col)\n", " results.append({\"method\": \"WNN\", \"resolution\": res, \"ARI\": float(ari)})\n", " res_df = pd.DataFrame(results).sort_values([\"resolution\"])\n", " display(res_df)\n", " csv_path = os.path.join(out_dir, \"ARI_WNN_summary.csv\")\n", " res_df.to_csv(csv_path, index=False)\n", " print(f\"[INFO] ARI salvati → {csv_path}\")\n", "else:\n", " print(\"[WARN] Nessuna colonna di etichette vere trovata (true_label). Salto il calcolo ARI.\")\n", "\n", "# 4) Salva il MuData aggiornato\n", "mdata.write(os.path.join(out_dir, \"mdata_after_wnn.h5mu\"))\n", "print(f\"[INFO] Salvato MuData → {os.path.join(out_dir, 'mdata_after_wnn.h5mu')}\")" ] }, { "cell_type": "code", "execution_count": 19, "id": "3d0f7df4-f85a-473b-a844-d9b65cbeac2e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[INFO] Inizio integrazione per concatenazione PCA (DEBUG 5×5 MODE)...\n\n[DEBUG] --- RNA ---\nShape (cells × features): (5000, 10000)\nPrime 5×5 celle rna (normalizzate e scalate):\n[[-4.5930e-01 -1.0819e+00 4.8890e-01 -3.9620e-01 1.5960e-01]\n [-1.0020e+00 -2.8540e-01 -1.4451e+00 -1.3384e+00 -3.0990e-01]\n [-5.4080e-01 1.1150e-01 1.3419e+00 9.8180e-01 -1.3718e+00]\n [ 1.0625e+00 1.9301e+00 1.1301e+00 -1.3384e+00 1.3454e+00]\n [-1.9660e-01 -8.3400e-02 -1.4451e+00 -1.3384e+00 -1.5000e-03]]\nPCA shape: (5000, 50), varianza media: 51.6740\nPrime 5x5 celle della PCA:\n[[-36.2213 28.7004 16.1661 -0.816 -3.7326]\n [-36.9042 26.9355 16.6614 -1.1801 3.8297]\n [ 1.8618 -3.6369 -18.9392 45.1308 1.4271]\n [ 6.9455 -32.0362 35.4417 -1.3034 -1.8827]\n [-37.3183 28.8054 16.4231 -1.7821 5.1176]]\nPCA variance ratio (prime 5): [0.0619 0.0602 0.0595 0.0569 0.0004]\n\n[DEBUG] --- DNA ---\nShape (cells × features): (5000, 20000)\nPrime 5×5 celle dna (normalizzate e scalate):\n[[-0.4074 -0.3807 1.7872 -0.3896 -0.395 ]\n [-0.4074 -0.3807 -0.5594 -0.3896 -0.395 ]\n [ 2.4538 -0.3807 1.7872 -0.3896 -0.395 ]\n [-0.4074 2.6263 -0.5594 -0.3896 -0.395 ]\n [-0.4074 -0.3807 1.7872 -0.3896 -0.395 ]]\nPCA shape: (5000, 50), varianza media: 54.3697\nPrime 5x5 celle della PCA:\n[[-23.7148 2.554 54.0891 -0.8168 0.8134]\n [-32.1915 -40.0711 -16.1456 2.0053 2.1533]\n [-21.7155 2.5589 52.835 3.8058 0.8215]\n [-22.7593 2.0298 53.5315 -0.1129 -0.8372]\n [-31.3252 -38.9129 -18.8992 -0.4091 -0.2577]]\nPCA variance ratio (prime 5): [0.0513 0.0347 0.0315 0.0004 0.0004]\n\n[INFO] Celle allineate: 5000\n[INFO] Concatenazione PCA completata: (5000, 100)\n[DEBUG] Prime 5×5 celle CONCATENATE:\n[[-36.2213 28.7004 16.1661 -0.816 -3.7326]\n [-36.9042 26.9355 16.6614 -1.1801 3.8297]\n [ 1.8618 -3.6369 -18.9392 45.1308 1.4271]\n [ 6.9455 -32.0362 35.4417 -1.3034 -1.8827]\n [-37.3183 28.8054 16.4231 -1.7821 5.1176]]\n[DEBUG] Neighbors graph: shape=(5000, 5000), mean_conn=0.003063, nnz=111560\n[DEBUG] Neighbors graph: shape=(5000, 5000), mean_conn=0.081779, nnz=70000", "[INFO] Calcolo Leiden per risoluzioni: [0.5, 1.0, 1.5, 2.0]\n[DEBUG] → leiden_concat_0.5: 20 clusters | mean=250.0 | min=40 | max=746", "[INFO] Salvato → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/concat_pca/figures_concat/concat_umap_leiden_res0.5.pdf\n[DEBUG] → leiden_concat_1: 20 clusters | mean=250.0 | min=40 | max=746", "[INFO] Salvato → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/concat_pca/figures_concat/concat_umap_leiden_res1.0.pdf\n[DEBUG] → leiden_concat_1.5: 20 clusters | mean=250.0 | min=40 | max=746", "[INFO] Salvato → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/concat_pca/figures_concat/concat_umap_leiden_res1.5.pdf\n[DEBUG] → leiden_concat_2: 20 clusters | mean=250.0 | min=40 | max=746", "[INFO] Salvato → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/concat_pca/figures_concat/concat_umap_leiden_res2.0.pdf\n\n=== RISULTATI ARI CONCAT (DEBUG 5×5) ===\n method resolution ARI\nConcatPCA 0.5 0.569959\nConcatPCA 1.0 0.569959\nConcatPCA 1.5 0.569959\nConcatPCA 2.0 0.569959\n[INFO] ARI salvati → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/concat_pca/ARI_ConcatPCA_summary.csv", "[INFO] Salvato MuData → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/concat_pca/mdata_after_concat.h5mu" ] } ], "source": [ "# --- BLOCCO 9: Integrazione concatenazione PCA (RNA+DNA) + Leiden + ARI (DEBUG 5×5 MODE) ---\n", "import numpy as np\n", "import pandas as pd\n", "import scanpy as sc\n", "import matplotlib.pyplot as plt\n", "from sklearn.metrics import adjusted_rand_score\n", "import os\n", "\n", "concat_dir = os.path.join(out_dir, \"concat_pca\")\n", "os.makedirs(concat_dir, exist_ok=True)\n", "fig_dir = os.path.join(concat_dir, \"figures_concat\")\n", "os.makedirs(fig_dir, exist_ok=True)\n", "\n", "print(\"[INFO] Inizio integrazione per concatenazione PCA (DEBUG 5×5 MODE)...\")\n", "\n", "# === 1️⃣ RNA e DNA PCA check ===\n", "for mod in [\"rna\", \"dna\"]:\n", " if \"X_pca\" not in mdata[mod].obsm:\n", " raise ValueError(f\"[ERROR] Manca PCA per {mod}! Riesegui sc.tl.pca.\")\n", " \n", " print(f\"\\n[DEBUG] --- {mod.upper()} ---\")\n", " print(f\"Shape (cells × features): {mdata[mod].shape}\")\n", "\n", " # Prime 5×5 celle della matrice normalizzata\n", " X = mdata[mod].X\n", " block_raw = X[:5, :5].A if hasattr(X, \"A\") else np.asarray(X[:5, :5])\n", " print(f\"Prime 5×5 celle {mod} (normalizzate e scalate):\\n{np.round(block_raw, 4)}\")\n", "\n", " # PCA info# PCA info\n", " X_pca = mdata[mod].obsm[\"X_pca\"]\n", " print(f\"PCA shape: {X_pca.shape}, varianza media: {np.var(X_pca):.4f}\")\n", "\n", " # Mostra le prime 5 righe e 5 colonne della PCA\n", " print(\"Prime 5x5 celle della PCA:\")\n", " print(np.round(X_pca[:5, :5], 4))\n", "\n", " # Varianza spiegata (se disponibile)\n", " if \"pca\" in mdata[mod].uns and \"variance_ratio\" in mdata[mod].uns[\"pca\"]:\n", " vr = mdata[mod].uns[\"pca\"][\"variance_ratio\"]\n", " print(f\"PCA variance ratio (prime 5): {np.round(vr[:5], 4)}\")\n", " \n", "# === 2️⃣ Estrai PCA e concatena ===\n", "pca_rna = mdata[\"rna\"].obsm[\"X_pca\"]\n", "pca_dna = mdata[\"dna\"].obsm[\"X_pca\"]\n", "\n", "if pca_rna.shape[0] != pca_dna.shape[0]:\n", " raise ValueError(f\"[ERROR] RNA e DNA hanno celle diverse: RNA={pca_rna.shape[0]} DNA={pca_dna.shape[0]}\")\n", "\n", "print(f\"\\n[INFO] Celle allineate: {pca_rna.shape[0]}\")\n", "X_concat = np.concatenate([pca_rna, pca_dna], axis=1)\n", "mdata.obsm[\"X_concat_pca\"] = X_concat\n", "\n", "print(f\"[INFO] Concatenazione PCA completata: {X_concat.shape}\")\n", "print(f\"[DEBUG] Prime 5×5 celle CONCATENATE:\\n{np.round(X_concat[:5, :5], 4)}\")\n", "\n", "# === 3️⃣ Neighbors, UMAP, Leiden ===\n", "sc.pp.neighbors(mdata, use_rep=\"X_concat_pca\")\n", "conn = mdata.obsp[\"connectivities\"]\n", "dist = mdata.obsp[\"distances\"]\n", "print(f\"[DEBUG] Neighbors graph: shape={conn.shape}, mean_conn={conn.mean():.6f}, nnz={conn.nnz}\")\n", "print(f\"[DEBUG] Neighbors graph: shape={dist.shape}, mean_conn={dist.mean():.6f}, nnz={dist.nnz}\")\n", "\n", "\n", "sc.tl.umap(mdata)\n", "mdata.obsm[\"X_concat_umap\"] = mdata.obsm[\"X_umap\"].copy()\n", "\n", "concat_res_list = [0.5, 1.0, 1.5, 2.0]\n", "print(f\"[INFO] Calcolo Leiden per risoluzioni: {concat_res_list}\")\n", "\n", "for res in concat_res_list:\n", " key = f\"leiden_concat_{res:.2f}\".rstrip(\"0\").rstrip(\".\")\n", " sc.tl.leiden(\n", " mdata,\n", " resolution=res,\n", " key_added=key,\n", " flavor=\"igraph\",\n", " n_iterations=2,\n", " directed=False,\n", " random_state=0, # deterministic\n", " )\n", "\n", " # Debug cluster info\n", " n_clusters = mdata.obs[key].nunique()\n", " sizes = mdata.obs[key].value_counts().sort_index().values\n", " print(f\"[DEBUG] → {key}: {n_clusters} clusters | mean={np.mean(sizes):.1f} | min={np.min(sizes)} | max={np.max(sizes)}\")\n", "\n", " # UMAP plot\n", " fig, ax = plt.subplots(figsize=(6, 5))\n", " sc.pl.embedding(\n", " mdata,\n", " basis=\"X_concat_umap\",\n", " color=key,\n", " title=f\"Concat PCA UMAP — Leiden res={res}\",\n", " frameon=False,\n", " ax=ax,\n", " show=False,\n", " )\n", " out_path = os.path.join(fig_dir, f\"concat_umap_leiden_res{res}.pdf\")\n", " plt.savefig(out_path, dpi=300, bbox_inches=\"tight\")\n", " plt.close(fig)\n", " print(f\"[INFO] Salvato → {out_path}\")\n", "\n", "# === 4️⃣ ARI (se presente true_label) ===\n", "label_col = None\n", "if \"true_label_col\" in globals() and true_label_col is not None:\n", " label_col = true_label_col\n", "elif \"true_label\" in mdata.obs.columns:\n", " label_col = \"true_label\"\n", "\n", "def _maybe_ari(obs, pred_col, label_col=None):\n", " if label_col is None or label_col not in obs.columns:\n", " return np.nan\n", " valid = obs[[label_col, pred_col]].dropna()\n", " if valid.empty:\n", " return np.nan\n", " return adjusted_rand_score(valid[label_col].astype(str), valid[pred_col].astype(str))\n", "\n", "results_concat = []\n", "if label_col is not None:\n", " for res in concat_res_list:\n", " col = f\"leiden_concat_{res:.2f}\".rstrip(\"0\").rstrip(\".\")\n", " ari = _maybe_ari(mdata.obs, col, label_col)\n", " results_concat.append({\"method\": \"ConcatPCA\", \"resolution\": res, \"ARI\": float(ari)})\n", " df_concat = pd.DataFrame(results_concat).sort_values(\"resolution\")\n", " print(\"\\n=== RISULTATI ARI CONCAT (DEBUG 5×5) ===\")\n", " print(df_concat.to_string(index=False))\n", " csv_path = os.path.join(concat_dir, \"ARI_ConcatPCA_summary.csv\")\n", " df_concat.to_csv(csv_path, index=False)\n", " print(f\"[INFO] ARI salvati → {csv_path}\")\n", "else:\n", " print(\"[WARN] Nessuna colonna true_label trovata, salto ARI per Concat PCA.\")\n", "\n", "# === 5️⃣ Salvataggio MuData ===\n", "out_path = os.path.join(concat_dir, \"mdata_after_concat.h5mu\")\n", "mdata.write(out_path)\n", "print(f\"[INFO] Salvato MuData → {out_path}\")" ] }, { "cell_type": "code", "execution_count": 20, "id": "05047a95-cc55-4cbb-abad-a336c2074c6e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[INFO] Likelihood per vista: {'rna': 'gaussian', 'dna': 'gaussian'}\n[INFO] Likelihoods usate in ordine: ['gaussian', 'gaussian']\n[INFO] Avvio MOFA con n_factors=10...\n\n #########################################################\n ### __ __ ____ ______ ### \n ### | \\/ |/ __ \\| ____/\\ _ ### \n ### | \\ / | | | | |__ / \\ _| |_ ### \n ### | |\\/| | | | | __/ /\\ \\_ _| ###\n ### | | | | |__| | | / ____ \\|_| ###\n ### |_| |_|\\____/|_|/_/ \\_\\ ###\n ### ### \n ######################################################### \n \n \n ", "Loaded view='rna' group='group1' with N=5000 samples and D=10000 features...\nLoaded view='dna' group='group1' with N=5000 samples and D=20000 features...\n\n", "Model options:\n- Automatic Relevance Determination prior on the factors: True\n- Automatic Relevance Determination prior on the weights: True\n- Spike-and-slab prior on the factors: False\n- Spike-and-slab prior on the weights: True\nLikelihoods:\n- View 0 (rna): gaussian\n- View 1 (dna): gaussian\n\n", "\n\n######################################\n## Training the model with seed 0 ##\n######################################\n\n", "\nConverged!\n\n\n\n#######################\n## Training finished ##\n#######################\n\n\nSaving model in /tmp/mofa_20251125-223303.hdf5...", "Saved MOFA embeddings in .obsm['X_mofa'] slot and their loadings in .varm['LFs'].\n[INFO] Calcolo neighbors e UMAP su fattori MOFA...", "[INFO] Calcolo Leiden per risoluzioni: [0.5, 1.0, 1.5, 2.0]", "[INFO] Salvato → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/mofa/figures_mofa/mofa_umap_leiden_res0.5.pdf", "[INFO] Salvato → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/mofa/figures_mofa/mofa_umap_leiden_res1.0.pdf", "[INFO] Salvato → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/mofa/figures_mofa/mofa_umap_leiden_res1.5.pdf", "[INFO] Salvato → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/mofa/figures_mofa/mofa_umap_leiden_res2.0.pdf" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
methodresolutionARI
0MOFA0.50.569959
1MOFA1.00.569959
2MOFA1.50.569959
3MOFA2.00.420227
\n", "
" ], "text/plain": [ " method resolution ARI\n", "0 MOFA 0.5 0.569959\n", "1 MOFA 1.0 0.569959\n", "2 MOFA 1.5 0.569959\n", "3 MOFA 2.0 0.420227" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[INFO] ARI salvati → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/mofa/ARI_MOFA_summary.csv", "[INFO] Salvato MuData finale → ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/mofa/mdata_after_mofa.h5mu" ] } ], "source": [ "# --- BLOCCO 10: Integrazione MOFA(+) + UMAP + Leiden + ARI ---\n", "\n", "import os\n", "import numpy as np\n", "import pandas as pd\n", "import scanpy as sc\n", "import muon as mu\n", "import matplotlib.pyplot as plt\n", "from sklearn.metrics import adjusted_rand_score\n", "\n", "mofa_dir = os.path.join(out_dir, \"mofa\")\n", "os.makedirs(mofa_dir, exist_ok=True)\n", "fig_dir = os.path.join(mofa_dir, \"figures_mofa\")\n", "os.makedirs(fig_dir, exist_ok=True)\n", "\n", "# 0️⃣ Verifica che mofapy2 sia installato\n", "try:\n", " import mofapy2 # noqa: F401\n", "except ImportError:\n", " raise ImportError(\"Devi installare mofapy2 prima di usare MOFA → pip install mofapy2\")\n", "\n", "# 1️⃣ Imposta likelihood per ciascuna vista\n", "likelihoods = {}\n", "for view in mdata.mod.keys():\n", " X = mdata.mod[view].X\n", " try:\n", " vals = np.unique(X.data if hasattr(X, \"data\") else np.asarray(X).ravel())\n", " is_binary = np.all(np.isin(vals, [0, 1]))\n", " except Exception:\n", " is_binary = False\n", "\n", " if view.lower() in (\"rna\", \"gex\", \"gene\", \"transcriptome\"):\n", " likelihoods[view] = \"gaussian\"\n", " elif is_binary:\n", " likelihoods[view] = \"bernoulli\"\n", " else:\n", " likelihoods[view] = \"gaussian\"\n", "\n", "print(\"[INFO] Likelihood per vista:\", likelihoods)\n", "\n", "# ➕ Converti in LISTA nell'ordine delle viste in mdata.mod\n", "likelihoods_list = [likelihoods[view] for view in mdata.mod.keys()]\n", "print(\"[INFO] Likelihoods usate in ordine:\", likelihoods_list)\n", "\n", "# 2️⃣ Esegui MOFA\n", "n_factors = 10\n", "print(f\"[INFO] Avvio MOFA con n_factors={n_factors}...\")\n", "mu.tl.mofa(\n", " mdata,\n", " n_factors=n_factors,\n", " likelihoods=likelihoods_list, # ✅ usa lista, non dict\n", " convergence_mode=\"medium\",\n", " seed=0,\n", ")\n", "\n", "# I fattori vengono salvati in mdata.obsm[\"X_mofa\"]\n", "assert \"X_mofa\" in mdata.obsm, \"MOFA non ha prodotto X_mofa; controlla i log di mofapy2\"\n", "\n", "# 3️⃣ Costruisci grafo e UMAP nello spazio MOFA\n", "print(\"[INFO] Calcolo neighbors e UMAP su fattori MOFA...\")\n", "sc.pp.neighbors(mdata, use_rep=\"X_mofa\")\n", "sc.tl.umap(mdata)\n", "mdata.obsm[\"X_mofa_umap\"] = mdata.obsm[\"X_umap\"].copy()\n", "\n", "# 4️⃣ Leiden multi-risoluzione\n", "mofa_res_list = [0.5, 1.0, 1.5, 2.0]\n", "print(f\"[INFO] Calcolo Leiden per risoluzioni: {mofa_res_list}\")\n", "for res in mofa_res_list:\n", " key = f\"leiden_mofa_{res}\"\n", " sc.tl.leiden(\n", " mdata,\n", " resolution=res,\n", " key_added=key,\n", " flavor=\"igraph\",\n", " n_iterations=2,\n", " directed=False,\n", " )\n", "\n", " # Plot\n", " fig, ax = plt.subplots(figsize=(6, 5))\n", " sc.pl.embedding(\n", " mdata,\n", " basis=\"X_mofa_umap\",\n", " color=key,\n", " title=f\"MOFA UMAP — Leiden res={res}\",\n", " frameon=False,\n", " ax=ax,\n", " show=False,\n", " )\n", " out_path = os.path.join(fig_dir, f\"mofa_umap_leiden_res{res}.pdf\")\n", " plt.savefig(out_path, dpi=300, bbox_inches=\"tight\")\n", " plt.close(fig)\n", " print(f\"[INFO] Salvato → {out_path}\")\n", "\n", "# 5️⃣ Calcolo ARI (se esiste una colonna di verità)\n", "def _maybe_ari(obs, pred_col, label_col=None):\n", " if label_col is None or label_col not in obs.columns:\n", " return np.nan\n", " valid = obs[[label_col, pred_col]].dropna()\n", " if valid.empty:\n", " return np.nan\n", " return adjusted_rand_score(valid[label_col].astype(str), valid[pred_col].astype(str))\n", "\n", "label_col = None\n", "if \"true_label_col\" in globals() and true_label_col is not None:\n", " label_col = true_label_col\n", "elif \"true_label\" in mdata.obs.columns:\n", " label_col = \"true_label\"\n", "\n", "results_mofa = []\n", "if label_col is not None:\n", " for res in mofa_res_list:\n", " col = f\"leiden_mofa_{res}\"\n", " ari = _maybe_ari(mdata.obs, col, label_col)\n", " results_mofa.append({\"method\": \"MOFA\", \"resolution\": res, \"ARI\": float(ari)})\n", " df_mofa = pd.DataFrame(results_mofa).sort_values(\"resolution\")\n", " display(df_mofa)\n", " csv_path = os.path.join(mofa_dir, \"ARI_MOFA_summary.csv\")\n", " df_mofa.to_csv(csv_path, index=False)\n", " print(f\"[INFO] ARI salvati → {csv_path}\")\n", "else:\n", " print(\"[WARN] Nessuna colonna true_label trovata: salto ARI per MOFA.\")\n", "\n", "# 6️⃣ Salvataggio finale del MuData aggiornato\n", "out_path = os.path.join(mofa_dir, \"mdata_after_mofa.h5mu\")\n", "mdata.write(out_path)\n", "print(f\"[INFO] Salvato MuData finale → {out_path}\")" ] }, { "cell_type": "code", "execution_count": 20, "id": "40b9c845-6a75-4167-8d0d-21a5b5b99d04", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 20, "id": "a77cd0f2-4b08-44e2-9422-6fb664613677", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "5710a4fe-6d4d-4894-a5c0-c620160836ff", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "127da0bc-d1c6-4c5a-9e0d-c601b1b9b9be", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "f605a6d7-c6e2-48e4-8510-f75266de9660", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "9c31cbe4-e48a-4e23-abfc-6a4cff80842b", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:scvar_env]", "language": "python", "name": "conda-env-scvar_env-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.18" } }, "nbformat": 4, "nbformat_minor": 5 }