Source code for seam.compiler

import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.spatial import distance

class Compiler:
    """Compile sequence-analysis data into a standardized pandas DataFrame.

    The DataFrame returned by :meth:`compile` contains:

    - ``DNN``: DNN predictions
    - ``Hamming``: Hamming distances (only if a reference sequence is
      provided in ``x_ref``)
    - ``GIA``: Global Importance Analysis (GIA) scores (only if background
      predictions are provided in ``y_bg``)
    - ``Sequence``: sequence strings

    Requirements:
        - numpy
        - pandas
        - scipy
        - tqdm (progress bar for the per-sequence conversion path)
    """

    def __init__(self, x, y, x_ref=None, y_bg=None, alphabet=None, gpu=False):
        """Initialize the Compiler.

        Args:
            x: One-hot sequences of shape (N, L, A).
            y: DNN predictions of shape (N, 1); any other rank is reshaped
                to a single column.
            x_ref: Optional reference sequence of shape (1, L, A).
            y_bg: Optional background predictions of shape (N, 1); any other
                rank is reshaped to a single column.
            alphabet: Characters for sequence conversion
                (e.g., ['A', 'C', 'G', 'T']). Defaults to the DNA alphabet
                when omitted or empty.
            gpu: Whether to use the batched sequence conversion
                (default: False).

        Raises:
            ValueError: If the inputs fail validation (see
                ``_validate_inputs``).
        """
        self.x = x
        self.y = y
        self.x_ref = x_ref
        self.y_bg = y_bg
        # Explicit None/empty test instead of `alphabet or ...` so that a
        # NumPy-array alphabet does not raise "truth value is ambiguous".
        if alphabet is None or len(alphabet) == 0:
            alphabet = ['A', 'C', 'G', 'T']
        self.alphabet = alphabet
        self.gpu = gpu

        # Validate inputs
        self._validate_inputs()

    def _validate_inputs(self):
        """Validate input shapes and normalize predictions to 2D columns.

        Raises:
            ValueError: If ``x`` or ``x_ref`` is not 3D, if both ``x_ref`` and
                ``y_bg`` are given, or if ``x_ref`` and ``x`` differ in
                sequence length.
        """
        if len(self.x.shape) != 3:
            raise ValueError("x must be 3D array of shape (N, L, A)")
        if len(self.y.shape) != 2:
            self.y = self.y.reshape(-1, 1)
        if self.x_ref is not None and len(self.x_ref.shape) != 3:
            raise ValueError("x_ref must be 3D array of shape (1, L, A)")
        if self.y_bg is not None and len(self.y_bg.shape) != 2:
            self.y_bg = self.y_bg.reshape(-1, 1)

        # Check that only one analysis type is requested
        if self.x_ref is not None and self.y_bg is not None:
            raise ValueError("Cannot provide both x_ref and y_bg. Choose either Hamming distance analysis (x_ref) or GIA analysis (y_bg) for a local or global library, respectively.")

        # A reference of a different length would either fail later with an
        # opaque broadcasting error or (for length 1) silently broadcast and
        # yield wrong Hamming distances, so reject it up front.
        if self.x_ref is not None and self.x_ref.shape[1] != self.x.shape[1]:
            raise ValueError(
                "x_ref and x must have the same sequence length L "
                f"(got {self.x_ref.shape[1]} and {self.x.shape[1]})"
            )

    def _oh2seq_cpu(self, one_hot):
        """Convert a single one-hot encoding of shape (L, A) to a string."""
        seq_index = np.argmax(one_hot, axis=1)
        # Index the alphabet directly rather than rebuilding a lookup dict
        # for every sequence.
        return ''.join(self.alphabet[i] for i in seq_index)

    def _oh2seq_gpu(self, x):
        """Convert a batch of one-hot encodings (N, L, A) to sequence strings.

        Despite its name, this method performs the conversion with batched
        NumPy operations; no GPU library is involved.

        Returns:
            List of N sequence strings (empty list for an empty batch).
        """
        # Index of the active character at every position: shape (N, L)
        seq_indices = np.argmax(x, axis=-1)
        # list(...) keeps a plain-string alphabet such as 'ACGT' from
        # collapsing into a 0-d array.
        lookup = np.asarray(list(self.alphabet))
        # Integer-array indexing maps all indices at once and, unlike
        # np.vectorize, also works for size-0 input.
        seq_chars = lookup[seq_indices]
        return [''.join(row) for row in seq_chars]

    def _compute_hamming_onehot(self, ref_onehot, seq_onehot):
        """Compute Hamming distances directly from one-hot encodings.

        Args:
            ref_onehot: Reference sequence one-hot encoding (L, A)
            seq_onehot: Query sequence(s) one-hot encoding, either (N, L, A)
                for a batch or (L, A) for a single sequence

        Returns:
            Array of N Hamming distances for a batch, or a scalar for a
            single sequence.
        """
        # Reduce over the last (alphabet) axis so that both the batched and
        # the single-sequence layouts are accepted; a hard-coded axis=2
        # fails on 2D input.
        ref_indices = np.argmax(ref_onehot, axis=-1)  # Shape: (L,)
        seq_indices = np.argmax(seq_onehot, axis=-1)  # Shape: (N, L) or (L,)

        # Count mismatching positions along the sequence axis
        return np.sum(ref_indices != seq_indices, axis=-1)

    def _compute_gia(self, y_pred, y_bg):
        """Compute GIA score as prediction minus background prediction."""
        return y_pred - y_bg

    def compile(self):
        """Compile data into pandas DataFrame.

        Returns:
            DataFrame with columns ``DNN``, then ``Hamming`` (if ``x_ref`` was
            provided), then ``GIA`` (if ``y_bg`` was provided), then
            ``Sequence``.
        """
        print("Compiling data...")

        # Convert sequences
        print("Converting sequences...")
        if self.gpu:
            sequences = self._oh2seq_gpu(self.x)
        else:
            sequences = [
                self._oh2seq_cpu(one_hot)
                for one_hot in tqdm(self.x, desc='Processing')
            ]

        # Assemble the columns in their final order
        columns = {'DNN': self.y.flatten()}

        # Compute Hamming distances if reference provided
        if self.x_ref is not None:
            print("Computing Hamming distances...")
            # Use one-hot method directly with the original one-hot encodings
            columns['Hamming'] = self._compute_hamming_onehot(self.x_ref[0], self.x)

        # Compute GIA scores if background provided
        if self.y_bg is not None:
            print("Computing GIA scores...")
            columns['GIA'] = self._compute_gia(self.y, self.y_bg).flatten()

        columns['Sequence'] = sequences
        return pd.DataFrame(columns)