Source code for kbkit.io.gro

"""Parses a GROMACS .gro file to extract residue electron counts and box volume."""

import re
from functools import cached_property
from pathlib import Path
from typing import ClassVar

import MDAnalysis
import numpy as np
from rdkit.Chem import GetPeriodicTable

from kbkit.utils.validation import validate_path


[docs] class GroParser: """ Parse a single GROMACS .gro file to compute valence electron counts and box volume. Parameters ---------- gro_path: str Path to the .gro file. """ MAX_SYMBOL_LENGTH: ClassVar = 2 def __init__(self, path: str | Path) -> None: valid_path = validate_path(path, suffix=".gro") try: self._universe = MDAnalysis.Universe(valid_path) except Exception: # only reformat if mda doesn't work. self._universe = MDAnalysis.Universe(self._resolve_gro_formatting(valid_path)) def _resolve_gro_formatting(self, input_file: Path) -> str: with open(input_file, "r") as f: lines = f.readlines() # The .gro format: # 1. Title (Line 1) # 2. Number of atoms (Line 2) # 3. Atom data (Line 3 to N+2) # 4. Box vectors (Last line) reformatted = [lines[0], lines[1]] # Keep header and atom count # Process only the atom lines (skip header/footer) for line in lines[2:-1]: parts = line.split() MAGIC_SIX = 6 if len(parts) < MAGIC_SIX: continue # Skip malformed empty lines # Handle cases where residue ID might be mixed with residue name try: res_id = int(parts[0]) res_name = parts[1] atom_name = parts[2] atom_id = int(parts[3]) x, y, z = float(parts[4]), float(parts[5]), float(parts[6]) except ValueError as e: # If first part contains non-numeric characters, extract numeric part res_id_match = re.search(r"\d+", parts[0]) if res_id_match: res_id = int(res_id_match.group()) # Extract residue name from the same field res_name_match = re.search(r"[A-Za-z]+", parts[0]) res_name = res_name_match.group() if res_name_match else "UNK" atom_name = parts[1] atom_id = int(parts[2]) x, y, z = float(parts[3]), float(parts[4]), float(parts[5]) else: raise ValueError(f"Could not parse residue ID from: '{parts[0]}'") from e # Strict GROMACS fixed-column format: # %5d%-5s%5s%5d%8.3f%8.3f%8.3f new_line = f"{res_id:>5}{res_name:<5}{atom_name:>5}{atom_id:>5}{x:>8.3f}{y:>8.3f}{z:>8.3f}\n" reformatted.append(new_line) reformatted.append(lines[-1]) # Add box vectors back output_file = Path(str(input_file).strip(".gro") + "_reformatted.gro") with open(output_file, "w") as f: f.writelines(reformatted) return str(output_file) @property def residues(self) -> MDAnalysis.core.groups.ResidueGroup: """MDAnalysis.core.groups.ResidueGroup: Residues in the .gro file.""" return self._universe.residues @property def molecule_count(self) -> dict[str, int]: """dict[str, int]: Unique molecule residues and corresponding counts.""" resnames, counts = np.unique(self.residues.resnames, return_counts=True) mol_dict = {res: int(count) for res, count in zip(resnames, counts, strict=False)} return mol_dict @property def molecules(self) -> list[str]: """list[str]: Names of molecules present.""" return list(self.molecule_count.keys()) @property def total_molecules(self) -> int: """int: Total number of molecules present.""" return sum(self.molecule_count.values()) @property def atom_counts(self) -> dict[str, dict[str, int]]: """dict[str, dict[str, int]]: Dictionary of residue names and their atom type counts.""" atoms_counts = {} for res in self.residues: if res.resname in atoms_counts: continue unique_atoms, counts = np.unique(res.atoms.types, return_counts=True) atoms_counts[res.resname] = {atom: int(count) for atom, count in zip(unique_atoms, counts, strict=False)} return atoms_counts @cached_property def electron_count(self) -> dict[str, int]: """dict[str, int]: Dictionary of residue types and their total electron count.""" residue_electrons = {} for resname, atom_dict in self.atom_counts.items(): total_electrons = sum(self.get_atomic_number(atom) * count for atom, count in atom_dict.items()) residue_electrons[resname] = total_electrons return residue_electrons @cached_property def box_volume(self) -> float: """float: Compute box volume (nm^3) from the last line of a GROMACS .gro file.""" box_A = np.asarray(self._universe.dimensions[:3]) box_nm = box_A / 10.0 # convert from Angstroms to nm volume_nm3 = np.prod(box_nm) return float(volume_nm3)
[docs] @staticmethod def is_valid_element(symbol: str) -> bool: """ Check if a string is a valid chemical element symbol. Parameters ---------- symbol : str Element symbol to validate (e.g., 'C', 'Na', 'Cl'). Returns ------- bool True if valid element, False otherwise. """ if not symbol or not isinstance(symbol, str): return False symbol = symbol.strip().capitalize() if len(symbol) > GroParser.MAX_SYMBOL_LENGTH: symbol = symbol[:2] ptable = GetPeriodicTable() return ptable.GetAtomicNumber(symbol) > 0
[docs] @staticmethod def get_atomic_number(symbol: str) -> int: """ Return atomic number of a valid element symbol. Parameters ---------- symbol : str Valid element symbol (e.g., 'C', 'Na', 'Cl'). Returns ------- int Atomic number of the element. """ symbol = symbol.strip().capitalize() # checks that symbol is a valid element if GroParser.is_valid_element(symbol): ptable = GetPeriodicTable() return ptable.GetAtomicNumber(symbol) else: raise ValueError(f"Symbol '{symbol}' is not a valid element.")