Source code for kbkit.io.gro

"""Parses a GROMACS .gro file to extract residue electron counts and box volume."""

import re
from functools import cached_property
from pathlib import Path
from typing import ClassVar

import MDAnalysis
import numpy as np
from rdkit.Chem import GetPeriodicTable

from kbkit.utils.validation import validate_path



[docs]
class GroParser:
    """
    Parse a single GROMACS .gro file to compute valence electron counts and box volume.

    Parameters
    ----------
    gro_path: str
        Path to the .gro file.
    """

    MAX_SYMBOL_LENGTH: ClassVar = 2

    def __init__(self, path: str | Path) -> None:
        valid_path = validate_path(path, suffix=".gro")
        try:
            self._universe = MDAnalysis.Universe(valid_path)
        except Exception:  # only reformat if mda doesn't work.
            self._universe = MDAnalysis.Universe(self._resolve_gro_formatting(valid_path))

    def _resolve_gro_formatting(self, input_file: Path) -> str:
        with open(input_file, "r") as f:
            lines = f.readlines()

        # The .gro format:
        # 1. Title (Line 1)
        # 2. Number of atoms (Line 2)
        # 3. Atom data (Line 3 to N+2)
        # 4. Box vectors (Last line)

        reformatted = [lines[0], lines[1]]  # Keep header and atom count

        # Process only the atom lines (skip header/footer)
        for line in lines[2:-1]:
            parts = line.split()
            MAGIC_SIX = 6
            if len(parts) < MAGIC_SIX:
                continue  # Skip malformed empty lines

            # Handle cases where residue ID might be mixed with residue name
            try:
                res_id = int(parts[0])
                res_name = parts[1]
                atom_name = parts[2]
                atom_id = int(parts[3])
                x, y, z = float(parts[4]), float(parts[5]), float(parts[6])
            except ValueError as e:
                # If first part contains non-numeric characters, extract numeric part

                res_id_match = re.search(r"\d+", parts[0])
                if res_id_match:
                    res_id = int(res_id_match.group())
                    # Extract residue name from the same field
                    res_name_match = re.search(r"[A-Za-z]+", parts[0])
                    res_name = res_name_match.group() if res_name_match else "UNK"
                    atom_name = parts[1]
                    atom_id = int(parts[2])
                    x, y, z = float(parts[3]), float(parts[4]), float(parts[5])
                else:
                    raise ValueError(f"Could not parse residue ID from: '{parts[0]}'") from e

            # Strict GROMACS fixed-column format:
            # %5d%-5s%5s%5d%8.3f%8.3f%8.3f
            new_line = f"{res_id:>5}{res_name:<5}{atom_name:>5}{atom_id:>5}{x:>8.3f}{y:>8.3f}{z:>8.3f}\n"
            reformatted.append(new_line)

        reformatted.append(lines[-1])  # Add box vectors back

        output_file = Path(str(input_file).strip(".gro") + "_reformatted.gro")
        with open(output_file, "w") as f:
            f.writelines(reformatted)

        return str(output_file)

    @property
    def residues(self) -> MDAnalysis.core.groups.ResidueGroup:
        """MDAnalysis.core.groups.ResidueGroup: Residues in the .gro file."""
        return self._universe.residues

    @property
    def molecule_count(self) -> dict[str, int]:
        """dict[str, int]: Unique molecule residues and corresponding counts."""
        resnames, counts = np.unique(self.residues.resnames, return_counts=True)
        mol_dict = {res: int(count) for res, count in zip(resnames, counts, strict=False)}
        return mol_dict

    @property
    def molecules(self) -> list[str]:
        """list[str]: Names of molecules present."""
        return list(self.molecule_count.keys())

    @property
    def total_molecules(self) -> int:
        """int: Total number of molecules present."""
        return sum(self.molecule_count.values())

    @property
    def atom_counts(self) -> dict[str, dict[str, int]]:
        """dict[str, dict[str, int]]: Dictionary of residue names and their atom type counts."""
        atoms_counts = {}
        for res in self.residues:
            if res.resname in atoms_counts:
                continue
            unique_atoms, counts = np.unique(res.atoms.types, return_counts=True)
            atoms_counts[res.resname] = {atom: int(count) for atom, count in zip(unique_atoms, counts, strict=False)}
        return atoms_counts

    @cached_property
    def electron_count(self) -> dict[str, int]:
        """dict[str, int]: Dictionary of residue types and their total electron count."""
        residue_electrons = {}
        for resname, atom_dict in self.atom_counts.items():
            total_electrons = sum(self.get_atomic_number(atom) * count for atom, count in atom_dict.items())
            residue_electrons[resname] = total_electrons
        return residue_electrons

    @cached_property
    def box_volume(self) -> float:
        """float: Compute box volume (nm^3) from the last line of a GROMACS .gro file."""
        box_A = np.asarray(self._universe.dimensions[:3])
        box_nm = box_A / 10.0  # convert from Angstroms to nm
        volume_nm3 = np.prod(box_nm)
        return float(volume_nm3)


[docs]
    @staticmethod
    def is_valid_element(symbol: str) -> bool:
        """
        Check if a string is a valid chemical element symbol.

        Parameters
        ----------
        symbol : str
            Element symbol to validate (e.g., 'C', 'Na', 'Cl').

        Returns
        -------
        bool
            True if valid element, False otherwise.
        """
        if not symbol or not isinstance(symbol, str):
            return False

        symbol = symbol.strip().capitalize()
        if len(symbol) > GroParser.MAX_SYMBOL_LENGTH:
            symbol = symbol[:2]

        ptable = GetPeriodicTable()
        return ptable.GetAtomicNumber(symbol) > 0



[docs]
    @staticmethod
    def get_atomic_number(symbol: str) -> int:
        """
        Return atomic number of a valid element symbol.

        Parameters
        ----------
        symbol : str
            Valid element symbol (e.g., 'C', 'Na', 'Cl').

        Returns
        -------
        int
            Atomic number of the element.
        """
        symbol = symbol.strip().capitalize()
        # checks that symbol is a valid element
        if GroParser.is_valid_element(symbol):
            ptable = GetPeriodicTable()
            return ptable.GetAtomicNumber(symbol)
        else:
            raise ValueError(f"Symbol '{symbol}' is not a valid element.")