{ "cells": [ { "cell_type": "code", "execution_count": 14, "id": "04c96386-1480-4ab3-a7f3-2aed794832e3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', 'calcOmicsClusters', 'distributionClusters', 'omicsIntegration', 'pairedIntegrationTrainer', 'save_all_umaps', 'scVAR', 'transcriptomicAnalysis', 'variantAnalysis', 'weightsInit']" ] } ], "source": [ "#Import all dependencies from the scVAR enviroment (see installation instructions)\n", "import importlib\n", "import scVAR\n", "importlib.reload(scVAR)\n", "print(dir(scVAR))\n", "import sys\n", "import pickle\n", "import os\n", "import scanpy as sc\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 15, "id": "be60befd-a7b5-4e3a-84c5-a80aa06ccc4b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Start Analysis sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3" ] } ], "source": [ "sample = \"sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3\"\n", "out_path = '../tests/' + sample\n", "in_path = '../tests/' + sample\n", "\n", "# Crea\n", "if not os.path.exists(out_path):\n", " os.makedirs(out_path, exist_ok=True)\n", "\n", "print('Start Analysis', sample)\n", "\n", "# Specify genomic file path\n", "var_mat = in_path + '/' +'consensus_filtered_markdup.mtx'\n", "barcode_var = in_path + '/' + 'barcodes_var.tsv'\n", "snv = in_path + '/' +'variants_filtered_markdup.txt'" ] }, { "cell_type": "code", "execution_count": 16, "id": "93db82d6-fd94-4494-86a1-844b90d05287", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[INFO] === TRANSCRIPTOMIC ANALYSIS START ===[DEBUG] Raw RNA matrix shape: (5000, 10000)\n", "[DEBUG] Prime 5×5 celle RAW (counts):\n", "[[1.54455 0.00000 3.06500 1.84460 2.34581]\n", " [0.00000 1.48369 0.00000 0.00000 1.62663]\n", " [1.31263 2.22300 4.41671 4.54242 0.00000]\n", " [5.87613 5.61039 4.08110 0.00000 4.16244]\n", " [2.29248 1.85998 0.00000 0.00000 2.09903]]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/opt/tools/deg/miniforge3/envs/scvar_env/lib/python3.10/site-packages/scanpy/preprocessing/_simple.py:392: RuntimeWarning: invalid value encountered in log1p\n", " np.log1p(X, out=X)\n", "/opt/tools/deg/miniforge3/envs/scvar_env/lib/python3.10/functools.py:889: UserWarning: zero-centering a sparse array/matrix densifies it.\n", " return dispatch(args[0].__class__)(*args, **kw)" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[DEBUG] RNA normalizzata e scalata (Muon-style). Shape: (5000, 10000)\n", "[DEBUG] Prime 5×5 celle RNA (normalizzate):\n", "[[-0.1368 -1.3475 0.5900 -0.0227 0.3918]\n", " [-1.2973 0.0203 -1.5114 -1.4452 0.0200]\n", " [-0.2579 0.4090 1.0154 0.8799 -1.4613]\n", " [ 1.1003 1.4959 0.9280 -1.4452 1.0607]\n", " [ 0.1765 0.2256 -1.5114 -1.4452 0.2665]]\n", "[INFO] PCA con 50 componenti..." ] }, { "name": "stderr", "output_type": "stream", "text": [ "/opt/tools/deg/miniforge3/envs/scvar_env/lib/python3.10/site-packages/sklearn/manifold/_spectral_embedding.py:328: UserWarning: Graph is not fully connected, spectral embedding may not work as expected.\n", " warnings.warn(" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[INFO] Salvato adata.uns['trans_X'] con shape (5000, 50)\n", "[DEBUG] Prime 5×5 celle PCA:\n", "[[ 27.5802 12.5513 -13.206 -0.4545 2.8701]\n", " [ 28.8187 10.3438 -13.1964 -1.2082 -4.3537]\n", " [ -1.777 2.2861 12.3413 30.4629 -2.2863]\n", " [ -6.7155 -26.169 -18.2502 -0.6037 0.9084]\n", " [ 28.8888 12.6758 -13.6591 -1.4595 -4.2337]]\n", "[DEBUG] PCA variance ratio (prime 5): [0.0285 0.0276 0.0272 0.0257 0.0005]\n", "[INFO] === TRANSCRIPTOMIC ANALYSIS DONE (5000 cells, 10000 genes) ===\n", "[INFO] === VARIANT ANALYSIS START ===\n", "[INFO] Lettura file: ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/consensus_filtered_markdup.mtx\n", "[WARN] Transposing variant matrix (20000, 5000) → expected (cells × variants)\n", "[INFO] Matrice varianti caricata → 5000 celle × 20000 varianti\n", "[DEBUG] Prime 5×5 celle RAW (counts):\n", "[[0 0 1 0 0]\n", " [0 0 0 0 0]\n", " [1 0 1 0 0]\n", " [0 1 0 0 0]\n", " [0 0 1 0 0]][INFO] Celle comuni con RNA: 5000\n", "[INFO] Rappresentazione scelta: MUON\n", "[INFO] Uso pipeline Muon-style (binarizzazione + scaling + PCA + UMAP)[INFO] Varianti mantenute: 20000/20000" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/opt/tools/deg/miniforge3/envs/scvar_env/lib/python3.10/functools.py:889: UserWarning: zero-centering a sparse array/matrix densifies it.\n", " return dispatch(args[0].__class__)(*args, **kw)" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[DEBUG] Prime 5×5 celle DNA (scalate):\n", "[[-0.4074 -0.3807 1.7872 -0.3896 -0.395 ]\n", " [-0.4074 -0.3807 -0.5594 -0.3896 -0.395 ]\n", " [ 2.4538 -0.3807 1.7872 -0.3896 -0.395 ]\n", " [-0.4074 2.6263 -0.5594 -0.3896 -0.395 ]\n", " [-0.4074 -0.3807 1.7872 -0.3896 -0.395 ]][DEBUG] PCA variance ratio (prime 5): [0.0513 0.0347 0.0315 0.0004 0.0004]\n", "[DEBUG] Prime 5×5 celle PCA:\n", "[[-23.7148 2.554 54.0891 -0.8168 0.8134]\n", " [-32.1915 -40.0711 -16.1456 2.0053 2.1533]\n", " [-21.7155 2.5589 52.835 3.8058 0.8215]\n", " [-22.7593 2.0298 53.5315 -0.1129 -0.8372]\n", " [-31.3252 -38.9129 -18.8992 -0.4091 -0.2577]]\n", "[INFO] DNA PCA shape: (5000, 30)\n", "[INFO] === VARIANT ANALYSIS DONE (5000 cells, 20000 variants) ===\n", "[INFO] === OMICS INTEGRATION START ===\n", "[INFO] Nessuna riscalatura: uso PCA pura (Muon-style).\n", "[INFO] n_cells=5000 ≥ 2000 → uso Autoencoder asimmetrico (RNA=teacher, VAR=student)[EPOCH 000] loss=12.20041[EPOCH 010] loss=9.84364[EPOCH 020] loss=9.29281[EPOCH 030] loss=8.89979[EPOCH 040] loss=8.66576[EPOCH 050] loss=8.47935[EPOCH 060] loss=8.37573[EPOCH 070] loss=8.21203[EPOCH 080] loss=8.10949[EPOCH 090] loss=8.02636[EPOCH 100] loss=7.95381[EPOCH 110] loss=7.89721[EPOCH 120] loss=7.86011[EPOCH 130] loss=7.75689[EPOCH 140] loss=7.68363[EPOCH 150] loss=7.67342[EPOCH 160] loss=7.59935[EPOCH 170] loss=7.59624[EPOCH 180] loss=7.59375[EPOCH 190] loss=7.53767[EPOCH 200] loss=7.50628[EPOCH 210] loss=7.48779[EPOCH 220] loss=7.46081[EPOCH 230] loss=7.45913[EPOCH 240] loss=7.40112[EPOCH 250] loss=7.40735[EPOCH 260] loss=7.36739[EPOCH 270] loss=7.37908[EPOCH 280] loss=7.33565[EPOCH 290] loss=7.33232[EPOCH 300] loss=7.29641[EPOCH 310] loss=7.29751[EPOCH 320] loss=7.27481[EPOCH 330] loss=7.31685[EPOCH 340] loss=7.28791[EPOCH 350] loss=7.25487[EARLY STOP] epoch=353, loss=7.25588\n", "\n", "[DEBUG] === AE DIAGNOSTIC ===\n", "Shape zA: (5000, 400), zB: (5000, 400)\n", "Varianza zA: 0.003693\n", "Varianza zB: 0.000615\n", "Covarianza media zA/zB: 0.000003\n", "SimAE (dot): 35.875084\n", "Media correlazione assoluta tra feature zA: 0.0882\n", "Range valori zA: -1.6930 → 2.9349\n", "Range valori zB: -1.6362 → 2.8093\n", "[DEBUG] Prime 5x5 celle zA:\n", " [[ 0.0551 -0.0612 -0.0695 -0.0238 -0.0324]\n", " [-0.0339 -0.0796 -0.0597 0.0498 0.039 ]\n", " [-0.0385 0.0456 0.0597 0.0698 0.0962]\n", " [-0.0265 0.0119 0.0295 0.0565 0.1366]\n", " [ 0.0226 0.0446 0.0062 0.0481 -0.0308]]\n", "[DEBUG] Prime 5x5 celle zB:\n", " [[-0.0421 -0.0021 0.0417 0.0299 0.0093]\n", " [-0.0069 -0.0332 0.0173 0.0387 0.0122]\n", " [-0.0257 0.0129 0.0199 -0.0048 0.0483]\n", " [-0.0359 -0.0138 0.0808 0.0518 -0.0027]\n", " [-0.0778 -0.0142 0.058 0.0141 -0.0141]]\n", "================================\n", "\n", "[DEBUG] Varianza media zA: 0.0921, zB: 0.0869, zAE: 0.0884\n", "[DEBUG] Prime 5×5 celle embedding AE:\n", "[[ 0.0065 -0.0316 -0.0139 0.0031 -0.0115]\n", " [-0.0204 -0.0564 -0.0212 0.0442 0.0256]\n", " [-0.0321 0.0293 0.0398 0.0325 0.0722]\n", " [-0.0312 -0.001 0.0551 0.0541 0.067 ]\n", " [-0.0276 0.0152 0.0321 0.0311 -0.0224]]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/opt/tools/deg/miniforge3/envs/scvar_env/lib/python3.10/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n", " warn(" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[INFO] AE similarity=35.875\n", "[INFO] === AUTOENCODER INTEGRATION COMPLETATA ===\n", "\n", "[INFO] === CALC OMiCS CLUSTERS (variant) ===\n", "[INFO] Numero celle: 5000\n", "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n", "[INFO] Embedding caricato da adata.uns['variant_X'] (30 dimensioni)\n", "[DEBUG] Prime 5×5 celle embedding:\n", "[[-23.7148 2.554 54.0891 -0.8168 0.8134]\n", " [-32.1915 -40.0711 -16.1456 2.0053 2.1533]\n", " [-21.7155 2.5589 52.835 3.8058 0.8215]\n", " [-22.7593 2.0298 53.5315 -0.1129 -0.8372]\n", " [-31.3252 -38.9129 -18.8992 -0.4091 -0.2577]]\n", "[DEBUG] Somma prime 5 righe: [np.float32(41.214), np.float32(-75.0844), np.float32(36.6839), np.float32(14.1632), np.float32(-99.744)]\n", "[DEBUG] Varianza media embedding: 85.4580\n", "[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.002743, mean_dist=0.040279\n", "[INFO] → Leiden completato su variant (res=0.5) → 4 cluster\n", "[INFO] === CLUSTERING DONE ===\n", "\n", "[INFO] === CALC OMiCS CLUSTERS (trans) ===\n", "[INFO] Numero celle: 5000\n", "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n", "[INFO] Embedding caricato da adata.uns['trans_X'] (50 dimensioni)\n", "[DEBUG] Prime 5×5 celle embedding:\n", "[[ 27.5802 12.5513 -13.206 -0.4545 2.8701]\n", " [ 28.8187 10.3438 -13.1964 -1.2082 -4.3537]\n", " [ -1.777 2.2861 12.3413 30.4629 -2.2863]\n", " [ -6.7155 -26.169 -18.2502 -0.6037 0.9084]\n", " [ 28.8888 12.6758 -13.6591 -1.4595 -4.2337]]\n", "[DEBUG] Somma prime 5 righe: [np.float32(24.3581), np.float32(26.9298), np.float32(35.7606), np.float32(-48.3718), np.float32(31.0977)]\n", "[DEBUG] Varianza media embedding: 26.3937[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.002376, mean_dist=0.046996\n", "[INFO] → Leiden completato su trans (res=0.5) → 5 cluster\n", "[INFO] === CLUSTERING DONE ===\n", "\n", "[INFO] === CALC OMiCS CLUSTERS (int) ===\n", "[INFO] Numero celle: 5000\n", "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n", "[INFO] Embedding caricato da adata.uns['int_X'] (400 dimensioni)\n", "[DEBUG] Prime 5×5 celle embedding:\n", "[[ 0.0065 -0.0316 -0.0139 0.0031 -0.0115]\n", " [-0.0204 -0.0564 -0.0212 0.0442 0.0256]\n", " [-0.0321 0.0293 0.0398 0.0325 0.0722]\n", " [-0.0312 -0.001 0.0551 0.0541 0.067 ]\n", " [-0.0276 0.0152 0.0321 0.0311 -0.0224]]\n", "[DEBUG] Somma prime 5 righe: [np.float32(19.3927), np.float32(18.6286), np.float32(19.5757), np.float32(18.5747), np.float32(19.2562)]\n", "[DEBUG] Varianza media embedding: 0.0884[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.001461, mean_dist=0.001989\n", "[INFO] → Leiden completato su int (res=0.5) → 4 cluster\n", "[INFO] === CLUSTERING DONE ===\n", "\n", "[INFO] === CALC OMiCS CLUSTERS (variant) ===\n", "[INFO] Numero celle: 5000\n", "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n", "[INFO] Embedding caricato da adata.uns['variant_X'] (30 dimensioni)\n", "[DEBUG] Prime 5×5 celle embedding:\n", "[[-23.7148 2.554 54.0891 -0.8168 0.8134]\n", " [-32.1915 -40.0711 -16.1456 2.0053 2.1533]\n", " [-21.7155 2.5589 52.835 3.8058 0.8215]\n", " [-22.7593 2.0298 53.5315 -0.1129 -0.8372]\n", " [-31.3252 -38.9129 -18.8992 -0.4091 -0.2577]]\n", "[DEBUG] Somma prime 5 righe: [np.float32(41.214), np.float32(-75.0844), np.float32(36.6839), np.float32(14.1632), np.float32(-99.744)]\n", "[DEBUG] Varianza media embedding: 85.4580[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.002743, mean_dist=0.040279\n", "[INFO] → Leiden completato su variant (res=1.0) → 4 cluster\n", "[INFO] === CLUSTERING DONE ===\n", "\n", "[INFO] === CALC OMiCS CLUSTERS (trans) ===\n", "[INFO] Numero celle: 5000\n", "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n", "[INFO] Embedding caricato da adata.uns['trans_X'] (50 dimensioni)\n", "[DEBUG] Prime 5×5 celle embedding:\n", "[[ 27.5802 12.5513 -13.206 -0.4545 2.8701]\n", " [ 28.8187 10.3438 -13.1964 -1.2082 -4.3537]\n", " [ -1.777 2.2861 12.3413 30.4629 -2.2863]\n", " [ -6.7155 -26.169 -18.2502 -0.6037 0.9084]\n", " [ 28.8888 12.6758 -13.6591 -1.4595 -4.2337]]\n", "[DEBUG] Somma prime 5 righe: [np.float32(24.3581), np.float32(26.9298), np.float32(35.7606), np.float32(-48.3718), np.float32(31.0977)]\n", "[DEBUG] Varianza media embedding: 26.3937[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.002376, mean_dist=0.046996\n", "[INFO] → Leiden completato su trans (res=1.0) → 5 cluster\n", "[INFO] === CLUSTERING DONE ===\n", "\n", "[INFO] === CALC OMiCS CLUSTERS (int) ===\n", "[INFO] Numero celle: 5000\n", "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n", "[INFO] Embedding caricato da adata.uns['int_X'] (400 dimensioni)\n", "[DEBUG] Prime 5×5 celle embedding:\n", "[[ 0.0065 -0.0316 -0.0139 0.0031 -0.0115]\n", " [-0.0204 -0.0564 -0.0212 0.0442 0.0256]\n", " [-0.0321 0.0293 0.0398 0.0325 0.0722]\n", " [-0.0312 -0.001 0.0551 0.0541 0.067 ]\n", " [-0.0276 0.0152 0.0321 0.0311 -0.0224]]\n", "[DEBUG] Somma prime 5 righe: [np.float32(19.3927), np.float32(18.6286), np.float32(19.5757), np.float32(18.5747), np.float32(19.2562)]\n", "[DEBUG] Varianza media embedding: 0.0884[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.001461, mean_dist=0.001989\n", "[INFO] → Leiden completato su int (res=1.0) → 5 cluster\n", "[INFO] === CLUSTERING DONE ===\n", "\n", "[INFO] === CALC OMiCS CLUSTERS (variant) ===\n", "[INFO] Numero celle: 5000\n", "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n", "[INFO] Embedding caricato da adata.uns['variant_X'] (30 dimensioni)\n", "[DEBUG] Prime 5×5 celle embedding:\n", "[[-23.7148 2.554 54.0891 -0.8168 0.8134]\n", " [-32.1915 -40.0711 -16.1456 2.0053 2.1533]\n", " [-21.7155 2.5589 52.835 3.8058 0.8215]\n", " [-22.7593 2.0298 53.5315 -0.1129 -0.8372]\n", " [-31.3252 -38.9129 -18.8992 -0.4091 -0.2577]]\n", "[DEBUG] Somma prime 5 righe: [np.float32(41.214), np.float32(-75.0844), np.float32(36.6839), np.float32(14.1632), np.float32(-99.744)]\n", "[DEBUG] Varianza media embedding: 85.4580[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.002743, mean_dist=0.040279\n", "[INFO] → Leiden completato su variant (res=1.5) → 5 cluster\n", "[INFO] === CLUSTERING DONE ===\n", "\n", "[INFO] === CALC OMiCS CLUSTERS (trans) ===\n", "[INFO] Numero celle: 5000\n", "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n", "[INFO] Embedding caricato da adata.uns['trans_X'] (50 dimensioni)\n", "[DEBUG] Prime 5×5 celle embedding:\n", "[[ 27.5802 12.5513 -13.206 -0.4545 2.8701]\n", " [ 28.8187 10.3438 -13.1964 -1.2082 -4.3537]\n", " [ -1.777 2.2861 12.3413 30.4629 -2.2863]\n", " [ -6.7155 -26.169 -18.2502 -0.6037 0.9084]\n", " [ 28.8888 12.6758 -13.6591 -1.4595 -4.2337]]\n", "[DEBUG] Somma prime 5 righe: [np.float32(24.3581), np.float32(26.9298), np.float32(35.7606), np.float32(-48.3718), np.float32(31.0977)]\n", "[DEBUG] Varianza media embedding: 26.3937[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.002376, mean_dist=0.046996\n", "[INFO] → Leiden completato su trans (res=1.5) → 5 cluster\n", "[INFO] === CLUSTERING DONE ===\n", "\n", "[INFO] === CALC OMiCS CLUSTERS (int) ===\n", "[INFO] Numero celle: 5000\n", "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n", "[INFO] Embedding caricato da adata.uns['int_X'] (400 dimensioni)\n", "[DEBUG] Prime 5×5 celle embedding:\n", "[[ 0.0065 -0.0316 -0.0139 0.0031 -0.0115]\n", " [-0.0204 -0.0564 -0.0212 0.0442 0.0256]\n", " [-0.0321 0.0293 0.0398 0.0325 0.0722]\n", " [-0.0312 -0.001 0.0551 0.0541 0.067 ]\n", " [-0.0276 0.0152 0.0321 0.0311 -0.0224]]\n", "[DEBUG] Somma prime 5 righe: [np.float32(19.3927), np.float32(18.6286), np.float32(19.5757), np.float32(18.5747), np.float32(19.2562)]\n", "[DEBUG] Varianza media embedding: 0.0884[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.001461, mean_dist=0.001989\n", "[INFO] → Leiden completato su int (res=1.5) → 12 cluster\n", "[INFO] === CLUSTERING DONE ===\n", "\n", "[INFO] === CALC OMiCS CLUSTERS (variant) ===\n", "[INFO] Numero celle: 5000\n", "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n", "[INFO] Embedding caricato da adata.uns['variant_X'] (30 dimensioni)\n", "[DEBUG] Prime 5×5 celle embedding:\n", "[[-23.7148 2.554 54.0891 -0.8168 0.8134]\n", " [-32.1915 -40.0711 -16.1456 2.0053 2.1533]\n", " [-21.7155 2.5589 52.835 3.8058 0.8215]\n", " [-22.7593 2.0298 53.5315 -0.1129 -0.8372]\n", " [-31.3252 -38.9129 -18.8992 -0.4091 -0.2577]]\n", "[DEBUG] Somma prime 5 righe: [np.float32(41.214), np.float32(-75.0844), np.float32(36.6839), np.float32(14.1632), np.float32(-99.744)]\n", "[DEBUG] Varianza media embedding: 85.4580[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.002743, mean_dist=0.040279\n", "[INFO] → Leiden completato su variant (res=2.0) → 12 cluster\n", "[INFO] === CLUSTERING DONE ===\n", "\n", "[INFO] === CALC OMiCS CLUSTERS (trans) ===\n", "[INFO] Numero celle: 5000\n", "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n", "[INFO] Embedding caricato da adata.uns['trans_X'] (50 dimensioni)\n", "[DEBUG] Prime 5×5 celle embedding:\n", "[[ 27.5802 12.5513 -13.206 -0.4545 2.8701]\n", " [ 28.8187 10.3438 -13.1964 -1.2082 -4.3537]\n", " [ -1.777 2.2861 12.3413 30.4629 -2.2863]\n", " [ -6.7155 -26.169 -18.2502 -0.6037 0.9084]\n", " [ 28.8888 12.6758 -13.6591 -1.4595 -4.2337]]\n", "[DEBUG] Somma prime 5 righe: [np.float32(24.3581), np.float32(26.9298), np.float32(35.7606), np.float32(-48.3718), np.float32(31.0977)]\n", "[DEBUG] Varianza media embedding: 26.3937[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.002376, mean_dist=0.046996\n", "[INFO] → Leiden completato su trans (res=2.0) → 5 cluster\n", "[INFO] === CLUSTERING DONE ===\n", "\n", "[INFO] === CALC OMiCS CLUSTERS (int) ===\n", "[INFO] Numero celle: 5000\n", "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n", "[INFO] Embedding caricato da adata.uns['int_X'] (400 dimensioni)\n", "[DEBUG] Prime 5×5 celle embedding:\n", "[[ 0.0065 -0.0316 -0.0139 0.0031 -0.0115]\n", " [-0.0204 -0.0564 -0.0212 0.0442 0.0256]\n", " [-0.0321 0.0293 0.0398 0.0325 0.0722]\n", " [-0.0312 -0.001 0.0551 0.0541 0.067 ]\n", " [-0.0276 0.0152 0.0321 0.0311 -0.0224]]\n", "[DEBUG] Somma prime 5 righe: [np.float32(19.3927), np.float32(18.6286), np.float32(19.5757), np.float32(18.5747), np.float32(19.2562)]\n", "[DEBUG] Varianza media embedding: 0.0884[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.001461, mean_dist=0.001989\n", "[INFO] → Leiden completato su int (res=2.0) → 21 cluster\n", "[INFO] === CLUSTERING DONE ===" ] } ], "source": [ "import numpy as np\n", "\n", "# RNA - OK\n", "adata = scVAR.transcriptomicAnalysis(\n", " path_10x=in_path,\n", " bcode_variants=barcode_var,\n", " n_pcs=50, # oppure None se vuoi usare tutto (più lento/rumoroso)\n", ")\n", "\n", "# VAR - OK\n", "adata = scVAR.variantAnalysis(adata, matrix_path=var_mat, bcode_path=barcode_var,\n", " variants_path=snv, n_pcs=30, variant_filter_level=\"medium\",variant_rep=\"muon\")\n", "\n", "\n", "adata = scVAR.omicsIntegration(\n", " adata,\n", " latent_dim=400,\n", " num_epoch=3500,\n", " lam_align=0.5,\n", " lam_cross=7.7,\n", " lam_recon_a=1.0,\n", " lam_recon_b=0.8,\n", ")\n", "\n", "# Compute transcriptomics, genomics and integrated clusters at different resolutions\n", "res_list = [0.5, 1.0, 1.5, 2.0]\n", "for res in res_list:\n", " adata = scVAR.calcOmicsClusters(adata, omic_key='variant', res=res)\n", " adata = scVAR.calcOmicsClusters(adata, omic_key='trans', res=res)\n", " adata = scVAR.calcOmicsClusters(adata, omic_key='int', res=res)" ] }, { "cell_type": "code", "execution_count": 17, "id": "4e52f021-0ca9-444a-a7b8-df7cd1fcb13d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AnnData object with n_obs × n_vars = 5000 × 10000\n", " obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'variant_clust_0.5', 'trans_clust_0.5', 'int_clust_0.5', 'variant_clust_1', 'trans_clust_1', 'int_clust_1', 'variant_clust_1.5', 'trans_clust_1.5', 'int_clust_1.5', 'variant_clust_2', 'trans_clust_2', 'int_clust_2'\n", " var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'mean', 'std'\n", " uns: 'trans_raw', 'trans_raw_obs_names', 'trans_raw_var_names', 'log1p', 'pca', 'neighbors', 'umap', 'trans_X', 'variant_raw', 'variant_raw_obs_names', 'variant_raw_var_names', 'variant_X', 'int_X', 'int_metrics', 'variant_neighbors', 'variant_clust_0.5', 'trans_neighbors', 'trans_clust_0.5', 'int_neighbors', 'int_clust_0.5', 'variant_clust_1', 'trans_clust_1', 'int_clust_1', 'variant_clust_1.5', 'trans_clust_1.5', 'int_clust_1.5', 'variant_clust_2', 'trans_clust_2', 'int_clust_2'\n", " obsm: 'X_pca', 'X_umap', 'trans_pca', 'trans_umap', 'variant_pca', 'variant_umap', 'int_umap', 'variant_X', 'trans_X', 'int_X'\n", " varm: 'PCs'\n", " obsp: 'distances', 'connectivities', 'variant_neighbors_distances', 'variant_neighbors_connectivities', 'trans_neighbors_distances', 'trans_neighbors_connectivities', 'int_neighbors_distances', 'int_neighbors_connectivities'\n", "Index(['n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt',\n", " 'pct_counts_mt', 'variant_clust_0.5', 'trans_clust_0.5',\n", " 'int_clust_0.5', 'variant_clust_1', 'trans_clust_1', 'int_clust_1',\n", " 'variant_clust_1.5', 'trans_clust_1.5', 'int_clust_1.5',\n", " 'variant_clust_2', 'trans_clust_2', 'int_clust_2'],\n", " dtype='object')" ] } ], "source": [ "print(adata)\n", "print(adata.obs.columns)" ] }, { "cell_type": "code", "execution_count": 18, "id": "746bbe89-e96f-46e3-b35b-69b89951d8cd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'variant_clust_0.5', 'trans_clust_0.5', 'int_clust_0.5', 'variant_clust_1', 'trans_clust_1', 'int_clust_1', 'variant_clust_1.5', 'trans_clust_1.5', 'int_clust_1.5', 'variant_clust_2', 'trans_clust_2', 'int_clust_2']\n", "variant_clust_0.5 4\n", "trans_clust_0.5 5\n", "int_clust_0.5 4\n", "variant_clust_1 4\n", "trans_clust_1 5\n", "int_clust_1 5\n", "variant_clust_1.5 5\n", "trans_clust_1.5 5\n", "int_clust_1.5 12\n", "variant_clust_2 12\n", "trans_clust_2 5\n", "int_clust_2 21\n", "Name: n_cluster, dtype: int64" ] } ], "source": [ "print(adata.obs.columns.tolist())\n", "import pandas as pd\n", "\n", "cluster_cols = [c for c in adata.obs.columns if \"clust\" in c]\n", "cluster_summary = {\n", " c: adata.obs[c].nunique(dropna=True)\n", " for c in cluster_cols\n", "}\n", "\n", "print(pd.Series(cluster_summary, name=\"n_cluster\"))" ] }, { "cell_type": "code", "execution_count": 19, "id": "757bed63-e59b-4cb4-a614-d099a59f8dad", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "=== 🔍 ISPEZIONE DETTAGLIATA DI adata.uns ===\n", " • trans_raw: shape=(5000, 10000), dtype=float64\n", " • trans_raw_obs_names: shape=(5000,), dtype=object\n", " • trans_raw_var_names: shape=(10000,), dtype=object\n", " • log1p: type=dict, len=1\n", " • pca: type=dict, len=3\n", " • neighbors: type=dict, len=3\n", " • umap: type=dict, len=1\n", " • trans_X: shape=(5000, 50), dtype=float32\n", " • variant_raw: shape=(5000, 20000), dtype=int64\n", " • variant_raw_obs_names: shape=(5000,), dtype=object\n", " • variant_raw_var_names: shape=(20000,), dtype=object\n", " • variant_X: shape=(5000, 30), dtype=float32\n", " • int_X: shape=(5000, 400), dtype=float32\n", " • int_metrics: type=dict, len=7\n", " • variant_neighbors: type=dict, len=3\n", " • variant_clust_0.5: type=dict, len=2\n", " • trans_neighbors: type=dict, len=3\n", " • trans_clust_0.5: type=dict, len=2\n", " • int_neighbors: type=dict, len=3\n", " • int_clust_0.5: type=dict, len=2\n", " • variant_clust_1: type=dict, len=2\n", " • trans_clust_1: type=dict, len=2\n", " • int_clust_1: type=dict, len=2\n", " • variant_clust_1.5: type=dict, len=2\n", " • trans_clust_1.5: type=dict, len=2\n", " • int_clust_1.5: type=dict, len=2\n", " • variant_clust_2: type=dict, len=2\n", " • trans_clust_2: type=dict, len=2\n", " • int_clust_2: type=dict, len=2" ] } ], "source": [ "print(\"=== 🔍 ISPEZIONE DETTAGLIATA DI adata.uns ===\")\n", "\n", "if hasattr(adata, \"uns\") and len(adata.uns.keys()) > 0:\n", " for key in adata.uns.keys():\n", " obj = adata.uns[key]\n", " # prova a estrarre shape e dtype se è array-like o matrice\n", " shape = getattr(obj, \"shape\", None)\n", " dtype = getattr(obj, \"dtype\", type(obj))\n", " if shape is not None:\n", " print(f\" • {key}: shape={shape}, dtype={dtype}\")\n", " else:\n", " t = type(obj).__name__\n", " size = len(obj) if hasattr(obj, \"__len__\") else \"-\"\n", " print(f\" • {key}: type={t}, len={size}\")\n", "else:\n", " print(\"⚠️ Nessun elemento trovato in adata.uns.\")" ] }, { "cell_type": "code", "execution_count": 20, "id": "a8627c7e-1641-483a-817e-7836fc039d83", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[INFO] Carico metadati da: ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/cell_metadata.csv[INFO] Create colonne combinate da 'TrueCellType' + 'TrueGenotype'\n", "[INFO] Colonna 'TrueCombo' aggiunta ad adata.obs (5000 celle, 0 mancanti)" ] } ], "source": [ "# === BLOCCO RICARICA GROUND TRUTH (per ARI/NMI) ===\n", "import pandas as pd\n", "import os\n", "\n", "meta_csv = \"../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/cell_metadata.csv\"\n", "\n", "if not os.path.exists(meta_csv):\n", " raise FileNotFoundError(f\"[ERRORE] File metadati non trovato: {meta_csv}\")\n", "\n", "print(f\"[INFO] Carico metadati da: {meta_csv}\")\n", "meta = pd.read_csv(meta_csv)\n", "\n", "# --- Identifica colonna delle celle ---\n", "if \"Cell\" in meta.columns:\n", " meta = meta.set_index(\"Cell\")\n", "elif \"cell_id\" in meta.columns:\n", " meta = meta.set_index(\"cell_id\")\n", "else:\n", " raise ValueError(f\"[ERRORE] Nessuna colonna 'Cell' o 'cell_id' trovata in {meta.columns.tolist()}\")\n", "\n", "# --- Identifica colonne di tipo e genotipo ---\n", "celltype_col = next((c for c in [\"TrueCellType\", \"CellType\", \"cell_type\"] if c in meta.columns), None)\n", "genotype_col = next((c for c in [\"TrueGenotype\", \"Genotype\", \"genotype\"] if c in meta.columns), None)\n", "\n", "if celltype_col is None and genotype_col is None:\n", " raise ValueError(f\"[ERRORE] Nessuna colonna di tipo cellula o genotipo trovata in {meta.columns.tolist()}\")\n", "\n", "# --- Crea la colonna combinata TrueCombo ---\n", "if celltype_col and genotype_col:\n", " meta[\"TrueCombo\"] = meta[celltype_col].astype(str) + \"_\" + meta[genotype_col].astype(str)\n", " print(f\"[INFO] Create colonne combinate da '{celltype_col}' + '{genotype_col}'\")\n", "elif celltype_col:\n", " meta[\"TrueCombo\"] = meta[celltype_col].astype(str)\n", " print(f\"[INFO] Usata sola colonna cell_type '{celltype_col}' come TrueCombo\")\n", "elif genotype_col:\n", " meta[\"TrueCombo\"] = meta[genotype_col].astype(str)\n", " print(f\"[INFO] Usata sola colonna genotype '{genotype_col}' come TrueCombo\")\n", "\n", "# --- Unisci con adata.obs ---\n", "adata.obs = adata.obs.join(meta[[\"TrueCombo\"]], how=\"left\")\n", "\n", "# --- Controlli ---\n", "if \"TrueCombo\" not in adata.obs.columns:\n", " raise ValueError(\"[ERRORE] Non è stata creata correttamente la colonna TrueCombo.\")\n", "n_missing = adata.obs[\"TrueCombo\"].isna().sum()\n", "print(f\"[INFO] Colonna 'TrueCombo' aggiunta ad adata.obs ({len(adata.obs)} celle, {n_missing} mancanti)\")" ] }, { "cell_type": "code", "execution_count": 22, "id": "baf95f33-2fe8-4766-a855-37959d1163e4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[INFO] Calcolo ARI/NMI Muon-style per le colonne int_clust_*\n", "[INFO] Risoluzioni trovate (ordinate): [0.5, 1.0, 1.5, 2.0]\n", "[INFO] res=0.5 (int_clust_0.5) → ARI=0.414944 | NMI=0.473148\n", "[INFO] res=1.0 (int_clust_1) → ARI=0.532132 | NMI=0.562063[INFO] res=1.5 (int_clust_1.5) → ARI=0.535991 | NMI=0.555448\n", "[INFO] res=2.0 (int_clust_2) → ARI=0.297338 | NMI=0.428204\n", "\n", "=== RISULTATI FINALI (Muon-style) ===\n", " method resolution colname ARI_MuonCombo NMI_MuonCombo\n", "ConcatPCA 0.5 int_clust_0.5 0.414944 0.473148\n", "ConcatPCA 1.0 int_clust_1 0.532132 0.562063\n", "ConcatPCA 1.5 int_clust_1.5 0.535991 0.555448\n", "ConcatPCA 2.0 int_clust_2 0.297338 0.428204" ] } ], "source": [ "# === BLOCCO UNICO: ARI MUON-STYLE ADATTATO A SCVAR (fix colnames) ===\n", "import pandas as pd\n", "import numpy as np\n", "from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score\n", "\n", "print(\"[INFO] Calcolo ARI/NMI Muon-style per le colonne int_clust_*\")\n", "\n", "# --- Trova tutte le colonne int_clust_* ---\n", "int_cols = [c for c in adata.obs.columns if c.startswith(\"int_clust_\")]\n", "if not int_cols:\n", " raise ValueError(\"Nessuna colonna int_clust_* trovata in adata.obs!\")\n", "\n", "# --- Verifica presenza colonna TrueCombo ---\n", "if \"TrueCombo\" not in adata.obs.columns:\n", " raise ValueError(\"Colonna 'TrueCombo' mancante in adata.obs — impossibile calcolare ARI/NMI.\")\n", "\n", "# --- Costruisci mappa colonna -> risoluzione (float), evitando startswith ---\n", "pairs = []\n", "for c in int_cols:\n", " suf = c.replace(\"int_clust_\", \"\", 1)\n", " try:\n", " res = float(suf)\n", " except ValueError:\n", " # gestisce casi tipo int_clust_XYZ: skip con warning\n", " print(f\"[WARN] Suffix non numerico per colonna {c} — salto.\")\n", " continue\n", " pairs.append((res, c))\n", "\n", "if not pairs:\n", " raise ValueError(\"Nessuna colonna int_clust_* con suffisso numerico valida.\")\n", "\n", "# Ordina per risoluzione (e poi per nome colonna per stabilità)\n", "pairs = sorted(pairs, key=lambda x: (x[0], x[1]))\n", "res_values = [p[0] for p in pairs]\n", "print(f\"[INFO] Risoluzioni trovate (ordinate): {res_values}\")\n", "\n", "results_concat = []\n", "\n", "# --- Loop diretto su (res, colname) già accoppiati correttamente ---\n", "for res, colname in pairs:\n", " if colname not in adata.obs.columns:\n", " print(f\"[WARN] Colonna {colname} non presente in adata.obs — salto.\")\n", " continue\n", "\n", " valid = adata.obs.dropna(subset=[colname, \"TrueCombo\"])\n", " if valid.empty:\n", " print(f\"[WARN] Nessun dato valido per {colname}.\")\n", " continue\n", "\n", " # Calcolo ARI e NMI rispetto a TrueCombo\n", " y_true = valid[\"TrueCombo\"].astype(str)\n", " y_pred = valid[colname].astype(str)\n", " ari = adjusted_rand_score(y_true, y_pred)\n", " nmi = normalized_mutual_info_score(y_true, y_pred)\n", "\n", " results_concat.append({\n", " \"method\": \"ConcatPCA\",\n", " \"resolution\": float(res),\n", " \"colname\": colname,\n", " \"ARI_MuonCombo\": float(ari),\n", " \"NMI_MuonCombo\": float(nmi),\n", " })\n", " print(f\"[INFO] res={res} ({colname}) → ARI={ari:.6f} | NMI={nmi:.6f}\")\n", "\n", "# --- Output finale ---\n", "if results_concat:\n", " df_concat = pd.DataFrame(results_concat).sort_values([\"resolution\", \"colname\"]).reset_index(drop=True)\n", " print(\"\\n=== RISULTATI FINALI (Muon-style) ===\")\n", " print(df_concat.to_string(index=False))\n", "else:\n", " print(\"[WARN] Nessun risultato ARI calcolato.\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:scvar_env]", "language": "python", "name": "conda-env-scvar_env-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.18" } }, "nbformat": 4, "nbformat_minor": 5 }