Source code for prolint.utils.utils

"""Utility functions for ProLint.

This module provides optimized utility functions for contact
computation and data processing.
"""

import numpy as np


def fast_contiguous_segment_lengths(arr, multiplier: float = 1.0) -> np.ndarray:
    """Measure every run of consecutive values in a sorted index array.

    A new run begins wherever two neighbouring elements differ by more
    than 1.

    Parameters
    ----------
    arr : array-like
        Sorted array of frame indices.
    multiplier : float, default=1.0
        Factor applied to every run length.

    Returns
    -------
    np.ndarray
        Run lengths (contact durations), one per run in order of
        appearance, each scaled by ``multiplier``. An empty array is
        returned when ``arr`` is empty.
    """
    total = len(arr)
    if total == 0:
        return np.array([])

    # Positions i with arr[i + 1] - arr[i] > 1: the last element of a run.
    gaps = np.diff(arr) > 1
    run_ends = np.nonzero(gaps)[0]

    # Start offset of every run, closed off by the total element count;
    # the spacing between neighbouring offsets is the length of each run.
    edges = np.concatenate(([0], run_ends + 1, [total]))
    run_lengths = np.diff(edges).astype(int)

    return run_lengths * multiplier
def fast_unique_comparison(residue_ids, database_ids, database_names):
    """Reduce parallel ID arrays to their distinct (residue, database) pairs.

    The pairs are sorted by residue ID and then by database ID; for every
    distinct pair the first entry of the sorted sequence is kept, together
    with the database name found at that position.

    Parameters
    ----------
    residue_ids : np.ndarray
        Array of residue IDs.
    database_ids : np.ndarray
        Array of database molecule IDs.
    database_names : np.ndarray
        Array of database residue names.

    Returns
    -------
    tuple of np.ndarray
        (unique_residue_ids, unique_database_ids, unique_database_names)
    """
    # Nothing to compare: hand back empty arrays that keep the input dtypes.
    if len(residue_ids) == 0:
        return (
            np.empty(0, dtype=residue_ids.dtype),
            np.empty(0, dtype=database_ids.dtype),
            np.empty(0, dtype=database_names.dtype),
        )

    # One row per (residue_id, database_id) pair.
    pairs = np.stack((residue_ids, database_ids), axis=-1)

    # lexsort treats its LAST key as the primary one: residue ID first,
    # database ID as the tie-breaker.
    order = np.lexsort((pairs[:, 1], pairs[:, 0]))
    ordered_pairs = pairs[order]

    # A row opens a new group when it differs from the row before it;
    # the very first row always does.
    deltas = np.diff(ordered_pairs, axis=0)
    starts_group = np.ones(ordered_pairs.shape[0], dtype=bool)
    starts_group[1:] = (deltas != 0).any(axis=1)

    distinct_pairs = ordered_pairs[starts_group]
    distinct_names = database_names[order][starts_group]

    return distinct_pairs[:, 0], distinct_pairs[:, 1], distinct_names