# Source code for kbkit.io.top

"""Parses a GROMACS .top file to extract molecule names and their counts from the [ molecules ] section."""

import re
from functools import cached_property
from pathlib import Path
from typing import Any

import numpy as np

from kbkit.utils.validation import validate_path

MAX_MOLECULE_PARTS = 2


[docs] class TopParser: """ Parses GROMACS topology file to get molecules present and their counts. Parameters ---------- top_path : str Path to the topology (.top) file. """ def __init__(self, path: str | Path) -> None: self.filepath = validate_path(path, suffix=".top") self.skipped_lines: list[Any] = [] def _is_valid_molecule_name(self, name: str) -> bool: # Allow letters, numbers, underscores, and hyphens return bool(re.match(r"^[A-Za-z0-9_\-]{2,50}$", name)) def _is_valid_count(self, count: str) -> bool: # check if string is valid number return count.isdigit()
[docs] def parse(self) -> None: """Read the topology file and returns a dictionary of molecule names and counts. Returns ------- dict Dictionary containing molecules present and their number. """ lines = self.filepath.read_text().splitlines() molecules = {} in_molecules_section = False # extract molecule name and numbers from file for _line_num, original_line in enumerate(lines, start=1): # Remove comments (anything after a semicolon) and leading/trailing whitespace line = original_line.split(";")[0].strip() if not line: continue # Skip empty lines # search for 'molecules' line if line.lower().startswith("[ molecules ]"): in_molecules_section = True continue if in_molecules_section: if line.startswith("["): break # Stop parsing if we encounter another section # Split the line by spaces and tabs, filtering out empty strings parts = re.split(r"\s+", line) if len(parts) < MAX_MOLECULE_PARTS: self.skipped_lines.append((original_line, "Missing molecule name or count")) continue molecule_name, count_str = parts[0], parts[1] if not self._is_valid_molecule_name(molecule_name): self.skipped_lines.append((original_line, "Invalid molecule name format")) continue if not self._is_valid_count(count_str): self.skipped_lines.append((original_line, "Invalid molecule count")) continue molecules[molecule_name] = int(count_str) if not molecules: raise ValueError("No molecules found in topology file.") self._molecule_count = molecules
[docs] def report_skipped(self) -> None: """Print a summary of lines that were skipped during parsing, including the line content and the reason for skipping.""" if self.skipped_lines: print("Skipped lines during parsing:") for line, reason in self.skipped_lines: print(f" Line: '{line}' => Reason: {reason}")
@cached_property def molecule_count(self) -> dict[str, int]: """dict[str, int]: Dictionary of molecules present and their corresponding numbers.""" if "_molecule_count" not in self.__dict__: self.parse() return self._molecule_count @property def molecules(self) -> list[str]: """list[str]: Names of molecules present.""" return list(self.molecule_count.keys()) @property def total_molecules(self) -> int: """int: Total number of molecules present.""" return sum(self.molecule_count.values()) @property def electron_count(self) -> dict[str, int]: """dict: Empty dict of electron counts.""" return {} @property def box_volume(self) -> float: """float: NaN value for box volume.""" return np.nan