{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "04c96386-1480-4ab3-a7f3-2aed794832e3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', 'calcOmicsClusters', 'distributionClusters', 'omicsIntegration', 'pairedIntegrationTrainer', 'save_all_umaps', 'scVAR', 'transcriptomicAnalysis', 'variantAnalysis', 'weightsInit']"
     ]
    }
   ],
   "source": [
    "#Import all dependencies from the scVAR enviroment (see installation instructions)\n",
    "import importlib\n",
    "import scVAR\n",
    "importlib.reload(scVAR)\n",
    "print(dir(scVAR))\n",
    "import sys\n",
    "import pickle\n",
    "import os\n",
    "import scanpy as sc\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "be60befd-a7b5-4e3a-84c5-a80aa06ccc4b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Start Analysis sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3"
     ]
    }
   ],
   "source": [
    "sample = \"sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3\"\n",
    "out_path = '../tests/' + sample\n",
    "in_path = '../tests/' + sample\n",
    "\n",
    "# Crea\n",
    "if not os.path.exists(out_path):\n",
    "    os.makedirs(out_path, exist_ok=True)\n",
    "\n",
    "print('Start Analysis', sample)\n",
    "\n",
    "# Specify genomic file path\n",
    "var_mat = in_path + '/' +'consensus_filtered_markdup.mtx'\n",
    "barcode_var = in_path + '/' + 'barcodes_var.tsv'\n",
    "snv = in_path + '/' +'variants_filtered_markdup.txt'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "93db82d6-fd94-4494-86a1-844b90d05287",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[INFO] === TRANSCRIPTOMIC ANALYSIS START ===[DEBUG] Raw RNA matrix shape: (5000, 10000)\n",
      "[DEBUG] Prime 5×5 celle RAW (counts):\n",
      "[[1.54455 0.00000 3.06500 1.84460 2.34581]\n",
      " [0.00000 1.48369 0.00000 0.00000 1.62663]\n",
      " [1.31263 2.22300 4.41671 4.54242 0.00000]\n",
      " [5.87613 5.61039 4.08110 0.00000 4.16244]\n",
      " [2.29248 1.85998 0.00000 0.00000 2.09903]]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/tools/deg/miniforge3/envs/scvar_env/lib/python3.10/site-packages/scanpy/preprocessing/_simple.py:392: RuntimeWarning: invalid value encountered in log1p\n",
      "  np.log1p(X, out=X)\n",
      "/opt/tools/deg/miniforge3/envs/scvar_env/lib/python3.10/functools.py:889: UserWarning: zero-centering a sparse array/matrix densifies it.\n",
      "  return dispatch(args[0].__class__)(*args, **kw)"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[DEBUG] RNA normalizzata e scalata (Muon-style). Shape: (5000, 10000)\n",
      "[DEBUG] Prime 5×5 celle RNA (normalizzate):\n",
      "[[-0.1368 -1.3475  0.5900 -0.0227  0.3918]\n",
      " [-1.2973  0.0203 -1.5114 -1.4452  0.0200]\n",
      " [-0.2579  0.4090  1.0154  0.8799 -1.4613]\n",
      " [ 1.1003  1.4959  0.9280 -1.4452  1.0607]\n",
      " [ 0.1765  0.2256 -1.5114 -1.4452  0.2665]]\n",
      "[INFO] PCA con 50 componenti..."
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/tools/deg/miniforge3/envs/scvar_env/lib/python3.10/site-packages/sklearn/manifold/_spectral_embedding.py:328: UserWarning: Graph is not fully connected, spectral embedding may not work as expected.\n",
      "  warnings.warn("
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[INFO] Salvato adata.uns['trans_X'] con shape (5000, 50)\n",
      "[DEBUG] Prime 5×5 celle PCA:\n",
      "[[ 27.5802  12.5513 -13.206   -0.4545   2.8701]\n",
      " [ 28.8187  10.3438 -13.1964  -1.2082  -4.3537]\n",
      " [ -1.777    2.2861  12.3413  30.4629  -2.2863]\n",
      " [ -6.7155 -26.169  -18.2502  -0.6037   0.9084]\n",
      " [ 28.8888  12.6758 -13.6591  -1.4595  -4.2337]]\n",
      "[DEBUG] PCA variance ratio (prime 5): [0.0285 0.0276 0.0272 0.0257 0.0005]\n",
      "[INFO] === TRANSCRIPTOMIC ANALYSIS DONE (5000 cells, 10000 genes) ===\n",
      "[INFO] === VARIANT ANALYSIS START ===\n",
      "[INFO] Lettura file: ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/consensus_filtered_markdup.mtx\n",
      "[WARN] Transposing variant matrix (20000, 5000) → expected (cells × variants)\n",
      "[INFO] Matrice varianti caricata → 5000 celle × 20000 varianti\n",
      "[DEBUG] Prime 5×5 celle RAW (counts):\n",
      "[[0 0 1 0 0]\n",
      " [0 0 0 0 0]\n",
      " [1 0 1 0 0]\n",
      " [0 1 0 0 0]\n",
      " [0 0 1 0 0]][INFO] Celle comuni con RNA: 5000\n",
      "[INFO] Rappresentazione scelta: MUON\n",
      "[INFO] Uso pipeline Muon-style (binarizzazione + scaling + PCA + UMAP)[INFO] Varianti mantenute: 20000/20000"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/tools/deg/miniforge3/envs/scvar_env/lib/python3.10/functools.py:889: UserWarning: zero-centering a sparse array/matrix densifies it.\n",
      "  return dispatch(args[0].__class__)(*args, **kw)"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[DEBUG] Prime 5×5 celle DNA (scalate):\n",
      "[[-0.4074 -0.3807  1.7872 -0.3896 -0.395 ]\n",
      " [-0.4074 -0.3807 -0.5594 -0.3896 -0.395 ]\n",
      " [ 2.4538 -0.3807  1.7872 -0.3896 -0.395 ]\n",
      " [-0.4074  2.6263 -0.5594 -0.3896 -0.395 ]\n",
      " [-0.4074 -0.3807  1.7872 -0.3896 -0.395 ]][DEBUG] PCA variance ratio (prime 5): [0.0513 0.0347 0.0315 0.0004 0.0004]\n",
      "[DEBUG] Prime 5×5 celle PCA:\n",
      "[[-23.7148   2.554   54.0891  -0.8168   0.8134]\n",
      " [-32.1915 -40.0711 -16.1456   2.0053   2.1533]\n",
      " [-21.7155   2.5589  52.835    3.8058   0.8215]\n",
      " [-22.7593   2.0298  53.5315  -0.1129  -0.8372]\n",
      " [-31.3252 -38.9129 -18.8992  -0.4091  -0.2577]]\n",
      "[INFO] DNA PCA shape: (5000, 30)\n",
      "[INFO] === VARIANT ANALYSIS DONE (5000 cells, 20000 variants) ===\n",
      "[INFO] === OMICS INTEGRATION START ===\n",
      "[INFO] Nessuna riscalatura: uso PCA pura (Muon-style).\n",
      "[INFO] n_cells=5000 ≥ 2000 → uso Autoencoder asimmetrico (RNA=teacher, VAR=student)[EPOCH 000] loss=12.20041[EPOCH 010] loss=9.84364[EPOCH 020] loss=9.29281[EPOCH 030] loss=8.89979[EPOCH 040] loss=8.66576[EPOCH 050] loss=8.47935[EPOCH 060] loss=8.37573[EPOCH 070] loss=8.21203[EPOCH 080] loss=8.10949[EPOCH 090] loss=8.02636[EPOCH 100] loss=7.95381[EPOCH 110] loss=7.89721[EPOCH 120] loss=7.86011[EPOCH 130] loss=7.75689[EPOCH 140] loss=7.68363[EPOCH 150] loss=7.67342[EPOCH 160] loss=7.59935[EPOCH 170] loss=7.59624[EPOCH 180] loss=7.59375[EPOCH 190] loss=7.53767[EPOCH 200] loss=7.50628[EPOCH 210] loss=7.48779[EPOCH 220] loss=7.46081[EPOCH 230] loss=7.45913[EPOCH 240] loss=7.40112[EPOCH 250] loss=7.40735[EPOCH 260] loss=7.36739[EPOCH 270] loss=7.37908[EPOCH 280] loss=7.33565[EPOCH 290] loss=7.33232[EPOCH 300] loss=7.29641[EPOCH 310] loss=7.29751[EPOCH 320] loss=7.27481[EPOCH 330] loss=7.31685[EPOCH 340] loss=7.28791[EPOCH 350] loss=7.25487[EARLY STOP] epoch=353, loss=7.25588\n",
      "\n",
      "[DEBUG] === AE DIAGNOSTIC ===\n",
      "Shape zA: (5000, 400), zB: (5000, 400)\n",
      "Varianza zA: 0.003693\n",
      "Varianza zB: 0.000615\n",
      "Covarianza media zA/zB: 0.000003\n",
      "SimAE (dot): 35.875084\n",
      "Media correlazione assoluta tra feature zA: 0.0882\n",
      "Range valori zA: -1.6930 → 2.9349\n",
      "Range valori zB: -1.6362 → 2.8093\n",
      "[DEBUG] Prime 5x5 celle zA:\n",
      " [[ 0.0551 -0.0612 -0.0695 -0.0238 -0.0324]\n",
      " [-0.0339 -0.0796 -0.0597  0.0498  0.039 ]\n",
      " [-0.0385  0.0456  0.0597  0.0698  0.0962]\n",
      " [-0.0265  0.0119  0.0295  0.0565  0.1366]\n",
      " [ 0.0226  0.0446  0.0062  0.0481 -0.0308]]\n",
      "[DEBUG] Prime 5x5 celle zB:\n",
      " [[-0.0421 -0.0021  0.0417  0.0299  0.0093]\n",
      " [-0.0069 -0.0332  0.0173  0.0387  0.0122]\n",
      " [-0.0257  0.0129  0.0199 -0.0048  0.0483]\n",
      " [-0.0359 -0.0138  0.0808  0.0518 -0.0027]\n",
      " [-0.0778 -0.0142  0.058   0.0141 -0.0141]]\n",
      "================================\n",
      "\n",
      "[DEBUG] Varianza media zA: 0.0921, zB: 0.0869, zAE: 0.0884\n",
      "[DEBUG] Prime 5×5 celle embedding AE:\n",
      "[[ 0.0065 -0.0316 -0.0139  0.0031 -0.0115]\n",
      " [-0.0204 -0.0564 -0.0212  0.0442  0.0256]\n",
      " [-0.0321  0.0293  0.0398  0.0325  0.0722]\n",
      " [-0.0312 -0.001   0.0551  0.0541  0.067 ]\n",
      " [-0.0276  0.0152  0.0321  0.0311 -0.0224]]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/tools/deg/miniforge3/envs/scvar_env/lib/python3.10/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
      "  warn("
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[INFO] AE similarity=35.875\n",
      "[INFO] === AUTOENCODER INTEGRATION COMPLETATA ===\n",
      "\n",
      "[INFO] === CALC OMiCS CLUSTERS (variant) ===\n",
      "[INFO] Numero celle: 5000\n",
      "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n",
      "[INFO] Embedding caricato da adata.uns['variant_X'] (30 dimensioni)\n",
      "[DEBUG] Prime 5×5 celle embedding:\n",
      "[[-23.7148   2.554   54.0891  -0.8168   0.8134]\n",
      " [-32.1915 -40.0711 -16.1456   2.0053   2.1533]\n",
      " [-21.7155   2.5589  52.835    3.8058   0.8215]\n",
      " [-22.7593   2.0298  53.5315  -0.1129  -0.8372]\n",
      " [-31.3252 -38.9129 -18.8992  -0.4091  -0.2577]]\n",
      "[DEBUG] Somma prime 5 righe: [np.float32(41.214), np.float32(-75.0844), np.float32(36.6839), np.float32(14.1632), np.float32(-99.744)]\n",
      "[DEBUG] Varianza media embedding: 85.4580\n",
      "[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.002743, mean_dist=0.040279\n",
      "[INFO] → Leiden completato su variant (res=0.5) → 4 cluster\n",
      "[INFO] === CLUSTERING DONE ===\n",
      "\n",
      "[INFO] === CALC OMiCS CLUSTERS (trans) ===\n",
      "[INFO] Numero celle: 5000\n",
      "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n",
      "[INFO] Embedding caricato da adata.uns['trans_X'] (50 dimensioni)\n",
      "[DEBUG] Prime 5×5 celle embedding:\n",
      "[[ 27.5802  12.5513 -13.206   -0.4545   2.8701]\n",
      " [ 28.8187  10.3438 -13.1964  -1.2082  -4.3537]\n",
      " [ -1.777    2.2861  12.3413  30.4629  -2.2863]\n",
      " [ -6.7155 -26.169  -18.2502  -0.6037   0.9084]\n",
      " [ 28.8888  12.6758 -13.6591  -1.4595  -4.2337]]\n",
      "[DEBUG] Somma prime 5 righe: [np.float32(24.3581), np.float32(26.9298), np.float32(35.7606), np.float32(-48.3718), np.float32(31.0977)]\n",
      "[DEBUG] Varianza media embedding: 26.3937[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.002376, mean_dist=0.046996\n",
      "[INFO] → Leiden completato su trans (res=0.5) → 5 cluster\n",
      "[INFO] === CLUSTERING DONE ===\n",
      "\n",
      "[INFO] === CALC OMiCS CLUSTERS (int) ===\n",
      "[INFO] Numero celle: 5000\n",
      "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n",
      "[INFO] Embedding caricato da adata.uns['int_X'] (400 dimensioni)\n",
      "[DEBUG] Prime 5×5 celle embedding:\n",
      "[[ 0.0065 -0.0316 -0.0139  0.0031 -0.0115]\n",
      " [-0.0204 -0.0564 -0.0212  0.0442  0.0256]\n",
      " [-0.0321  0.0293  0.0398  0.0325  0.0722]\n",
      " [-0.0312 -0.001   0.0551  0.0541  0.067 ]\n",
      " [-0.0276  0.0152  0.0321  0.0311 -0.0224]]\n",
      "[DEBUG] Somma prime 5 righe: [np.float32(19.3927), np.float32(18.6286), np.float32(19.5757), np.float32(18.5747), np.float32(19.2562)]\n",
      "[DEBUG] Varianza media embedding: 0.0884[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.001461, mean_dist=0.001989\n",
      "[INFO] → Leiden completato su int (res=0.5) → 4 cluster\n",
      "[INFO] === CLUSTERING DONE ===\n",
      "\n",
      "[INFO] === CALC OMiCS CLUSTERS (variant) ===\n",
      "[INFO] Numero celle: 5000\n",
      "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n",
      "[INFO] Embedding caricato da adata.uns['variant_X'] (30 dimensioni)\n",
      "[DEBUG] Prime 5×5 celle embedding:\n",
      "[[-23.7148   2.554   54.0891  -0.8168   0.8134]\n",
      " [-32.1915 -40.0711 -16.1456   2.0053   2.1533]\n",
      " [-21.7155   2.5589  52.835    3.8058   0.8215]\n",
      " [-22.7593   2.0298  53.5315  -0.1129  -0.8372]\n",
      " [-31.3252 -38.9129 -18.8992  -0.4091  -0.2577]]\n",
      "[DEBUG] Somma prime 5 righe: [np.float32(41.214), np.float32(-75.0844), np.float32(36.6839), np.float32(14.1632), np.float32(-99.744)]\n",
      "[DEBUG] Varianza media embedding: 85.4580[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.002743, mean_dist=0.040279\n",
      "[INFO] → Leiden completato su variant (res=1.0) → 4 cluster\n",
      "[INFO] === CLUSTERING DONE ===\n",
      "\n",
      "[INFO] === CALC OMiCS CLUSTERS (trans) ===\n",
      "[INFO] Numero celle: 5000\n",
      "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n",
      "[INFO] Embedding caricato da adata.uns['trans_X'] (50 dimensioni)\n",
      "[DEBUG] Prime 5×5 celle embedding:\n",
      "[[ 27.5802  12.5513 -13.206   -0.4545   2.8701]\n",
      " [ 28.8187  10.3438 -13.1964  -1.2082  -4.3537]\n",
      " [ -1.777    2.2861  12.3413  30.4629  -2.2863]\n",
      " [ -6.7155 -26.169  -18.2502  -0.6037   0.9084]\n",
      " [ 28.8888  12.6758 -13.6591  -1.4595  -4.2337]]\n",
      "[DEBUG] Somma prime 5 righe: [np.float32(24.3581), np.float32(26.9298), np.float32(35.7606), np.float32(-48.3718), np.float32(31.0977)]\n",
      "[DEBUG] Varianza media embedding: 26.3937[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.002376, mean_dist=0.046996\n",
      "[INFO] → Leiden completato su trans (res=1.0) → 5 cluster\n",
      "[INFO] === CLUSTERING DONE ===\n",
      "\n",
      "[INFO] === CALC OMiCS CLUSTERS (int) ===\n",
      "[INFO] Numero celle: 5000\n",
      "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n",
      "[INFO] Embedding caricato da adata.uns['int_X'] (400 dimensioni)\n",
      "[DEBUG] Prime 5×5 celle embedding:\n",
      "[[ 0.0065 -0.0316 -0.0139  0.0031 -0.0115]\n",
      " [-0.0204 -0.0564 -0.0212  0.0442  0.0256]\n",
      " [-0.0321  0.0293  0.0398  0.0325  0.0722]\n",
      " [-0.0312 -0.001   0.0551  0.0541  0.067 ]\n",
      " [-0.0276  0.0152  0.0321  0.0311 -0.0224]]\n",
      "[DEBUG] Somma prime 5 righe: [np.float32(19.3927), np.float32(18.6286), np.float32(19.5757), np.float32(18.5747), np.float32(19.2562)]\n",
      "[DEBUG] Varianza media embedding: 0.0884[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.001461, mean_dist=0.001989\n",
      "[INFO] → Leiden completato su int (res=1.0) → 5 cluster\n",
      "[INFO] === CLUSTERING DONE ===\n",
      "\n",
      "[INFO] === CALC OMiCS CLUSTERS (variant) ===\n",
      "[INFO] Numero celle: 5000\n",
      "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n",
      "[INFO] Embedding caricato da adata.uns['variant_X'] (30 dimensioni)\n",
      "[DEBUG] Prime 5×5 celle embedding:\n",
      "[[-23.7148   2.554   54.0891  -0.8168   0.8134]\n",
      " [-32.1915 -40.0711 -16.1456   2.0053   2.1533]\n",
      " [-21.7155   2.5589  52.835    3.8058   0.8215]\n",
      " [-22.7593   2.0298  53.5315  -0.1129  -0.8372]\n",
      " [-31.3252 -38.9129 -18.8992  -0.4091  -0.2577]]\n",
      "[DEBUG] Somma prime 5 righe: [np.float32(41.214), np.float32(-75.0844), np.float32(36.6839), np.float32(14.1632), np.float32(-99.744)]\n",
      "[DEBUG] Varianza media embedding: 85.4580[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.002743, mean_dist=0.040279\n",
      "[INFO] → Leiden completato su variant (res=1.5) → 5 cluster\n",
      "[INFO] === CLUSTERING DONE ===\n",
      "\n",
      "[INFO] === CALC OMiCS CLUSTERS (trans) ===\n",
      "[INFO] Numero celle: 5000\n",
      "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n",
      "[INFO] Embedding caricato da adata.uns['trans_X'] (50 dimensioni)\n",
      "[DEBUG] Prime 5×5 celle embedding:\n",
      "[[ 27.5802  12.5513 -13.206   -0.4545   2.8701]\n",
      " [ 28.8187  10.3438 -13.1964  -1.2082  -4.3537]\n",
      " [ -1.777    2.2861  12.3413  30.4629  -2.2863]\n",
      " [ -6.7155 -26.169  -18.2502  -0.6037   0.9084]\n",
      " [ 28.8888  12.6758 -13.6591  -1.4595  -4.2337]]\n",
      "[DEBUG] Somma prime 5 righe: [np.float32(24.3581), np.float32(26.9298), np.float32(35.7606), np.float32(-48.3718), np.float32(31.0977)]\n",
      "[DEBUG] Varianza media embedding: 26.3937[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.002376, mean_dist=0.046996\n",
      "[INFO] → Leiden completato su trans (res=1.5) → 5 cluster\n",
      "[INFO] === CLUSTERING DONE ===\n",
      "\n",
      "[INFO] === CALC OMiCS CLUSTERS (int) ===\n",
      "[INFO] Numero celle: 5000\n",
      "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n",
      "[INFO] Embedding caricato da adata.uns['int_X'] (400 dimensioni)\n",
      "[DEBUG] Prime 5×5 celle embedding:\n",
      "[[ 0.0065 -0.0316 -0.0139  0.0031 -0.0115]\n",
      " [-0.0204 -0.0564 -0.0212  0.0442  0.0256]\n",
      " [-0.0321  0.0293  0.0398  0.0325  0.0722]\n",
      " [-0.0312 -0.001   0.0551  0.0541  0.067 ]\n",
      " [-0.0276  0.0152  0.0321  0.0311 -0.0224]]\n",
      "[DEBUG] Somma prime 5 righe: [np.float32(19.3927), np.float32(18.6286), np.float32(19.5757), np.float32(18.5747), np.float32(19.2562)]\n",
      "[DEBUG] Varianza media embedding: 0.0884[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.001461, mean_dist=0.001989\n",
      "[INFO] → Leiden completato su int (res=1.5) → 12 cluster\n",
      "[INFO] === CLUSTERING DONE ===\n",
      "\n",
      "[INFO] === CALC OMiCS CLUSTERS (variant) ===\n",
      "[INFO] Numero celle: 5000\n",
      "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n",
      "[INFO] Embedding caricato da adata.uns['variant_X'] (30 dimensioni)\n",
      "[DEBUG] Prime 5×5 celle embedding:\n",
      "[[-23.7148   2.554   54.0891  -0.8168   0.8134]\n",
      " [-32.1915 -40.0711 -16.1456   2.0053   2.1533]\n",
      " [-21.7155   2.5589  52.835    3.8058   0.8215]\n",
      " [-22.7593   2.0298  53.5315  -0.1129  -0.8372]\n",
      " [-31.3252 -38.9129 -18.8992  -0.4091  -0.2577]]\n",
      "[DEBUG] Somma prime 5 righe: [np.float32(41.214), np.float32(-75.0844), np.float32(36.6839), np.float32(14.1632), np.float32(-99.744)]\n",
      "[DEBUG] Varianza media embedding: 85.4580[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.002743, mean_dist=0.040279\n",
      "[INFO] → Leiden completato su variant (res=2.0) → 12 cluster\n",
      "[INFO] === CLUSTERING DONE ===\n",
      "\n",
      "[INFO] === CALC OMiCS CLUSTERS (trans) ===\n",
      "[INFO] Numero celle: 5000\n",
      "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n",
      "[INFO] Embedding caricato da adata.uns['trans_X'] (50 dimensioni)\n",
      "[DEBUG] Prime 5×5 celle embedding:\n",
      "[[ 27.5802  12.5513 -13.206   -0.4545   2.8701]\n",
      " [ 28.8187  10.3438 -13.1964  -1.2082  -4.3537]\n",
      " [ -1.777    2.2861  12.3413  30.4629  -2.2863]\n",
      " [ -6.7155 -26.169  -18.2502  -0.6037   0.9084]\n",
      " [ 28.8888  12.6758 -13.6591  -1.4595  -4.2337]]\n",
      "[DEBUG] Somma prime 5 righe: [np.float32(24.3581), np.float32(26.9298), np.float32(35.7606), np.float32(-48.3718), np.float32(31.0977)]\n",
      "[DEBUG] Varianza media embedding: 26.3937[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.002376, mean_dist=0.046996\n",
      "[INFO] → Leiden completato su trans (res=2.0) → 5 cluster\n",
      "[INFO] === CLUSTERING DONE ===\n",
      "\n",
      "[INFO] === CALC OMiCS CLUSTERS (int) ===\n",
      "[INFO] Numero celle: 5000\n",
      "[INFO] Dataset grande → uso embedding autoencoder (AE-style)\n",
      "[INFO] Embedding caricato da adata.uns['int_X'] (400 dimensioni)\n",
      "[DEBUG] Prime 5×5 celle embedding:\n",
      "[[ 0.0065 -0.0316 -0.0139  0.0031 -0.0115]\n",
      " [-0.0204 -0.0564 -0.0212  0.0442  0.0256]\n",
      " [-0.0321  0.0293  0.0398  0.0325  0.0722]\n",
      " [-0.0312 -0.001   0.0551  0.0541  0.067 ]\n",
      " [-0.0276  0.0152  0.0321  0.0311 -0.0224]]\n",
      "[DEBUG] Somma prime 5 righe: [np.float32(19.3927), np.float32(18.6286), np.float32(19.5757), np.float32(18.5747), np.float32(19.2562)]\n",
      "[DEBUG] Varianza media embedding: 0.0884[DEBUG] Neighbors graph: (5000, 5000), mean_conn=0.001461, mean_dist=0.001989\n",
      "[INFO] → Leiden completato su int (res=2.0) → 21 cluster\n",
      "[INFO] === CLUSTERING DONE ==="
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "# RNA - OK\n",
    "adata = scVAR.transcriptomicAnalysis(\n",
    "    path_10x=in_path,\n",
    "    bcode_variants=barcode_var,\n",
    "    n_pcs=50,                 # oppure None se vuoi usare tutto (più lento/rumoroso)\n",
    ")\n",
    "\n",
    "# VAR - OK\n",
    "adata = scVAR.variantAnalysis(adata, matrix_path=var_mat, bcode_path=barcode_var,\n",
    "                              variants_path=snv, n_pcs=30, variant_filter_level=\"medium\",variant_rep=\"muon\")\n",
    "\n",
    "\n",
    "adata = scVAR.omicsIntegration(\n",
    "    adata,\n",
    "    latent_dim=400,\n",
    "    num_epoch=3500,\n",
    "    lam_align=0.5,\n",
    "    lam_cross=7.7,\n",
    "    lam_recon_a=1.0,\n",
    "    lam_recon_b=0.8,\n",
    ")\n",
    "\n",
    "# Compute transcriptomics, genomics and integrated clusters at different resolutions\n",
    "res_list = [0.5, 1.0, 1.5, 2.0]\n",
    "for res in res_list:\n",
    "    adata = scVAR.calcOmicsClusters(adata, omic_key='variant', res=res)\n",
    "    adata = scVAR.calcOmicsClusters(adata, omic_key='trans', res=res)\n",
    "    adata = scVAR.calcOmicsClusters(adata, omic_key='int', res=res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "4e52f021-0ca9-444a-a7b8-df7cd1fcb13d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AnnData object with n_obs × n_vars = 5000 × 10000\n",
      "    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'variant_clust_0.5', 'trans_clust_0.5', 'int_clust_0.5', 'variant_clust_1', 'trans_clust_1', 'int_clust_1', 'variant_clust_1.5', 'trans_clust_1.5', 'int_clust_1.5', 'variant_clust_2', 'trans_clust_2', 'int_clust_2'\n",
      "    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'mean', 'std'\n",
      "    uns: 'trans_raw', 'trans_raw_obs_names', 'trans_raw_var_names', 'log1p', 'pca', 'neighbors', 'umap', 'trans_X', 'variant_raw', 'variant_raw_obs_names', 'variant_raw_var_names', 'variant_X', 'int_X', 'int_metrics', 'variant_neighbors', 'variant_clust_0.5', 'trans_neighbors', 'trans_clust_0.5', 'int_neighbors', 'int_clust_0.5', 'variant_clust_1', 'trans_clust_1', 'int_clust_1', 'variant_clust_1.5', 'trans_clust_1.5', 'int_clust_1.5', 'variant_clust_2', 'trans_clust_2', 'int_clust_2'\n",
      "    obsm: 'X_pca', 'X_umap', 'trans_pca', 'trans_umap', 'variant_pca', 'variant_umap', 'int_umap', 'variant_X', 'trans_X', 'int_X'\n",
      "    varm: 'PCs'\n",
      "    obsp: 'distances', 'connectivities', 'variant_neighbors_distances', 'variant_neighbors_connectivities', 'trans_neighbors_distances', 'trans_neighbors_connectivities', 'int_neighbors_distances', 'int_neighbors_connectivities'\n",
      "Index(['n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt',\n",
      "       'pct_counts_mt', 'variant_clust_0.5', 'trans_clust_0.5',\n",
      "       'int_clust_0.5', 'variant_clust_1', 'trans_clust_1', 'int_clust_1',\n",
      "       'variant_clust_1.5', 'trans_clust_1.5', 'int_clust_1.5',\n",
      "       'variant_clust_2', 'trans_clust_2', 'int_clust_2'],\n",
      "      dtype='object')"
     ]
    }
   ],
   "source": [
    "print(adata)\n",
    "print(adata.obs.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "746bbe89-e96f-46e3-b35b-69b89951d8cd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'variant_clust_0.5', 'trans_clust_0.5', 'int_clust_0.5', 'variant_clust_1', 'trans_clust_1', 'int_clust_1', 'variant_clust_1.5', 'trans_clust_1.5', 'int_clust_1.5', 'variant_clust_2', 'trans_clust_2', 'int_clust_2']\n",
      "variant_clust_0.5     4\n",
      "trans_clust_0.5       5\n",
      "int_clust_0.5         4\n",
      "variant_clust_1       4\n",
      "trans_clust_1         5\n",
      "int_clust_1           5\n",
      "variant_clust_1.5     5\n",
      "trans_clust_1.5       5\n",
      "int_clust_1.5        12\n",
      "variant_clust_2      12\n",
      "trans_clust_2         5\n",
      "int_clust_2          21\n",
      "Name: n_cluster, dtype: int64"
     ]
    }
   ],
   "source": [
    "print(adata.obs.columns.tolist())\n",
    "import pandas as pd\n",
    "\n",
    "cluster_cols = [c for c in adata.obs.columns if \"clust\" in c]\n",
    "cluster_summary = {\n",
    "    c: adata.obs[c].nunique(dropna=True)\n",
    "    for c in cluster_cols\n",
    "}\n",
    "\n",
    "print(pd.Series(cluster_summary, name=\"n_cluster\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "757bed63-e59b-4cb4-a614-d099a59f8dad",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=== 🔍 ISPEZIONE DETTAGLIATA DI adata.uns ===\n",
      " • trans_raw: shape=(5000, 10000), dtype=float64\n",
      " • trans_raw_obs_names: shape=(5000,), dtype=object\n",
      " • trans_raw_var_names: shape=(10000,), dtype=object\n",
      " • log1p: type=dict, len=1\n",
      " • pca: type=dict, len=3\n",
      " • neighbors: type=dict, len=3\n",
      " • umap: type=dict, len=1\n",
      " • trans_X: shape=(5000, 50), dtype=float32\n",
      " • variant_raw: shape=(5000, 20000), dtype=int64\n",
      " • variant_raw_obs_names: shape=(5000,), dtype=object\n",
      " • variant_raw_var_names: shape=(20000,), dtype=object\n",
      " • variant_X: shape=(5000, 30), dtype=float32\n",
      " • int_X: shape=(5000, 400), dtype=float32\n",
      " • int_metrics: type=dict, len=7\n",
      " • variant_neighbors: type=dict, len=3\n",
      " • variant_clust_0.5: type=dict, len=2\n",
      " • trans_neighbors: type=dict, len=3\n",
      " • trans_clust_0.5: type=dict, len=2\n",
      " • int_neighbors: type=dict, len=3\n",
      " • int_clust_0.5: type=dict, len=2\n",
      " • variant_clust_1: type=dict, len=2\n",
      " • trans_clust_1: type=dict, len=2\n",
      " • int_clust_1: type=dict, len=2\n",
      " • variant_clust_1.5: type=dict, len=2\n",
      " • trans_clust_1.5: type=dict, len=2\n",
      " • int_clust_1.5: type=dict, len=2\n",
      " • variant_clust_2: type=dict, len=2\n",
      " • trans_clust_2: type=dict, len=2\n",
      " • int_clust_2: type=dict, len=2"
     ]
    }
   ],
   "source": [
    "print(\"=== 🔍 ISPEZIONE DETTAGLIATA DI adata.uns ===\")\n",
    "\n",
    "if hasattr(adata, \"uns\") and len(adata.uns.keys()) > 0:\n",
    "    for key in adata.uns.keys():\n",
    "        obj = adata.uns[key]\n",
    "        # prova a estrarre shape e dtype se è array-like o matrice\n",
    "        shape = getattr(obj, \"shape\", None)\n",
    "        dtype = getattr(obj, \"dtype\", type(obj))\n",
    "        if shape is not None:\n",
    "            print(f\" • {key}: shape={shape}, dtype={dtype}\")\n",
    "        else:\n",
    "            t = type(obj).__name__\n",
    "            size = len(obj) if hasattr(obj, \"__len__\") else \"-\"\n",
    "            print(f\" • {key}: type={t}, len={size}\")\n",
    "else:\n",
    "    print(\"⚠️ Nessun elemento trovato in adata.uns.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "a8627c7e-1641-483a-817e-7836fc039d83",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[INFO] Carico metadati da: ../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/cell_metadata.csv[INFO] Create colonne combinate da 'TrueCellType' + 'TrueGenotype'\n",
      "[INFO] Colonna 'TrueCombo' aggiunta ad adata.obs (5000 celle, 0 mancanti)"
     ]
    }
   ],
   "source": [
    "# === BLOCCO RICARICA GROUND TRUTH (per ARI/NMI) ===\n",
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "meta_csv = \"../tests/sim_c5000_g10000_v20000_ct5_gt4_es0.3_vs0.1_ee0.3_ve0.3_ln0.3/cell_metadata.csv\"\n",
    "\n",
    "if not os.path.exists(meta_csv):\n",
    "    raise FileNotFoundError(f\"[ERRORE] File metadati non trovato: {meta_csv}\")\n",
    "\n",
    "print(f\"[INFO] Carico metadati da: {meta_csv}\")\n",
    "meta = pd.read_csv(meta_csv)\n",
    "\n",
    "# --- Identifica colonna delle celle ---\n",
    "if \"Cell\" in meta.columns:\n",
    "    meta = meta.set_index(\"Cell\")\n",
    "elif \"cell_id\" in meta.columns:\n",
    "    meta = meta.set_index(\"cell_id\")\n",
    "else:\n",
    "    raise ValueError(f\"[ERRORE] Nessuna colonna 'Cell' o 'cell_id' trovata in {meta.columns.tolist()}\")\n",
    "\n",
    "# --- Identifica colonne di tipo e genotipo ---\n",
    "celltype_col = next((c for c in [\"TrueCellType\", \"CellType\", \"cell_type\"] if c in meta.columns), None)\n",
    "genotype_col = next((c for c in [\"TrueGenotype\", \"Genotype\", \"genotype\"] if c in meta.columns), None)\n",
    "\n",
    "if celltype_col is None and genotype_col is None:\n",
    "    raise ValueError(f\"[ERRORE] Nessuna colonna di tipo cellula o genotipo trovata in {meta.columns.tolist()}\")\n",
    "\n",
    "# --- Crea la colonna combinata TrueCombo ---\n",
    "if celltype_col and genotype_col:\n",
    "    meta[\"TrueCombo\"] = meta[celltype_col].astype(str) + \"_\" + meta[genotype_col].astype(str)\n",
    "    print(f\"[INFO] Create colonne combinate da '{celltype_col}' + '{genotype_col}'\")\n",
    "elif celltype_col:\n",
    "    meta[\"TrueCombo\"] = meta[celltype_col].astype(str)\n",
    "    print(f\"[INFO] Usata sola colonna cell_type '{celltype_col}' come TrueCombo\")\n",
    "elif genotype_col:\n",
    "    meta[\"TrueCombo\"] = meta[genotype_col].astype(str)\n",
    "    print(f\"[INFO] Usata sola colonna genotype '{genotype_col}' come TrueCombo\")\n",
    "\n",
    "# --- Unisci con adata.obs ---\n",
    "adata.obs = adata.obs.join(meta[[\"TrueCombo\"]], how=\"left\")\n",
    "\n",
    "# --- Controlli ---\n",
    "if \"TrueCombo\" not in adata.obs.columns:\n",
    "    raise ValueError(\"[ERRORE] Non è stata creata correttamente la colonna TrueCombo.\")\n",
    "n_missing = adata.obs[\"TrueCombo\"].isna().sum()\n",
    "print(f\"[INFO] Colonna 'TrueCombo' aggiunta ad adata.obs ({len(adata.obs)} celle, {n_missing} mancanti)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "baf95f33-2fe8-4766-a855-37959d1163e4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[INFO] Calcolo ARI/NMI Muon-style per le colonne int_clust_*\n",
      "[INFO] Risoluzioni trovate (ordinate): [0.5, 1.0, 1.5, 2.0]\n",
      "[INFO] res=0.5 (int_clust_0.5) → ARI=0.414944 | NMI=0.473148\n",
      "[INFO] res=1.0 (int_clust_1) → ARI=0.532132 | NMI=0.562063[INFO] res=1.5 (int_clust_1.5) → ARI=0.535991 | NMI=0.555448\n",
      "[INFO] res=2.0 (int_clust_2) → ARI=0.297338 | NMI=0.428204\n",
      "\n",
      "=== RISULTATI FINALI (Muon-style) ===\n",
      "   method  resolution       colname  ARI_MuonCombo  NMI_MuonCombo\n",
      "ConcatPCA         0.5 int_clust_0.5       0.414944       0.473148\n",
      "ConcatPCA         1.0   int_clust_1       0.532132       0.562063\n",
      "ConcatPCA         1.5 int_clust_1.5       0.535991       0.555448\n",
      "ConcatPCA         2.0   int_clust_2       0.297338       0.428204"
     ]
    }
   ],
   "source": [
    "# === BLOCCO UNICO: ARI MUON-STYLE ADATTATO A SCVAR (fix colnames) ===\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score\n",
    "\n",
    "print(\"[INFO] Calcolo ARI/NMI Muon-style per le colonne int_clust_*\")\n",
    "\n",
    "# --- Trova tutte le colonne int_clust_* ---\n",
    "int_cols = [c for c in adata.obs.columns if c.startswith(\"int_clust_\")]\n",
    "if not int_cols:\n",
    "    raise ValueError(\"Nessuna colonna int_clust_* trovata in adata.obs!\")\n",
    "\n",
    "# --- Verifica presenza colonna TrueCombo ---\n",
    "if \"TrueCombo\" not in adata.obs.columns:\n",
    "    raise ValueError(\"Colonna 'TrueCombo' mancante in adata.obs — impossibile calcolare ARI/NMI.\")\n",
    "\n",
    "# --- Costruisci mappa colonna -> risoluzione (float), evitando startswith ---\n",
    "pairs = []\n",
    "for c in int_cols:\n",
    "    suf = c.replace(\"int_clust_\", \"\", 1)\n",
    "    try:\n",
    "        res = float(suf)\n",
    "    except ValueError:\n",
    "        # gestisce casi tipo int_clust_XYZ: skip con warning\n",
    "        print(f\"[WARN] Suffix non numerico per colonna {c} — salto.\")\n",
    "        continue\n",
    "    pairs.append((res, c))\n",
    "\n",
    "if not pairs:\n",
    "    raise ValueError(\"Nessuna colonna int_clust_* con suffisso numerico valida.\")\n",
    "\n",
    "# Ordina per risoluzione (e poi per nome colonna per stabilità)\n",
    "pairs = sorted(pairs, key=lambda x: (x[0], x[1]))\n",
    "res_values = [p[0] for p in pairs]\n",
    "print(f\"[INFO] Risoluzioni trovate (ordinate): {res_values}\")\n",
    "\n",
    "results_concat = []\n",
    "\n",
    "# --- Loop diretto su (res, colname) già accoppiati correttamente ---\n",
    "for res, colname in pairs:\n",
    "    if colname not in adata.obs.columns:\n",
    "        print(f\"[WARN] Colonna {colname} non presente in adata.obs — salto.\")\n",
    "        continue\n",
    "\n",
    "    valid = adata.obs.dropna(subset=[colname, \"TrueCombo\"])\n",
    "    if valid.empty:\n",
    "        print(f\"[WARN] Nessun dato valido per {colname}.\")\n",
    "        continue\n",
    "\n",
    "    # Calcolo ARI e NMI rispetto a TrueCombo\n",
    "    y_true = valid[\"TrueCombo\"].astype(str)\n",
    "    y_pred = valid[colname].astype(str)\n",
    "    ari = adjusted_rand_score(y_true, y_pred)\n",
    "    nmi = normalized_mutual_info_score(y_true, y_pred)\n",
    "\n",
    "    results_concat.append({\n",
    "        \"method\": \"ConcatPCA\",\n",
    "        \"resolution\": float(res),\n",
    "        \"colname\": colname,\n",
    "        \"ARI_MuonCombo\": float(ari),\n",
    "        \"NMI_MuonCombo\": float(nmi),\n",
    "    })\n",
    "    print(f\"[INFO] res={res} ({colname}) → ARI={ari:.6f} | NMI={nmi:.6f}\")\n",
    "\n",
    "# --- Output finale ---\n",
    "if results_concat:\n",
    "    df_concat = pd.DataFrame(results_concat).sort_values([\"resolution\", \"colname\"]).reset_index(drop=True)\n",
    "    print(\"\\n=== RISULTATI FINALI (Muon-style) ===\")\n",
    "    print(df_concat.to_string(index=False))\n",
    "else:\n",
    "    print(\"[WARN] Nessun risultato ARI calcolato.\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:scvar_env]",
   "language": "python",
   "name": "conda-env-scvar_env-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}