# Source code for kbkit.io.top
"""Parses a GROMACS .top file to extract molecule names and their counts from the [ molecules ] section."""
import re
from functools import cached_property
from pathlib import Path
from typing import Any
import numpy as np
from kbkit.utils.validation import validate_path
MAX_MOLECULE_PARTS = 2
[docs]
class TopParser:
"""
Parses GROMACS topology file to get molecules present and their counts.
Parameters
----------
top_path : str
Path to the topology (.top) file.
"""
def __init__(self, path: str | Path) -> None:
self.filepath = validate_path(path, suffix=".top")
self.skipped_lines: list[Any] = []
def _is_valid_molecule_name(self, name: str) -> bool:
# Allow letters, numbers, underscores, and hyphens
return bool(re.match(r"^[A-Za-z0-9_\-]{2,50}$", name))
def _is_valid_count(self, count: str) -> bool:
# check if string is valid number
return count.isdigit()
[docs]
def parse(self) -> None:
"""Read the topology file and returns a dictionary of molecule names and counts.
Returns
-------
dict
Dictionary containing molecules present and their number.
"""
lines = self.filepath.read_text().splitlines()
molecules = {}
in_molecules_section = False
# extract molecule name and numbers from file
for _line_num, original_line in enumerate(lines, start=1):
# Remove comments (anything after a semicolon) and leading/trailing whitespace
line = original_line.split(";")[0].strip()
if not line:
continue # Skip empty lines
# search for 'molecules' line
if line.lower().startswith("[ molecules ]"):
in_molecules_section = True
continue
if in_molecules_section:
if line.startswith("["):
break # Stop parsing if we encounter another section
# Split the line by spaces and tabs, filtering out empty strings
parts = re.split(r"\s+", line)
if len(parts) < MAX_MOLECULE_PARTS:
self.skipped_lines.append((original_line, "Missing molecule name or count"))
continue
molecule_name, count_str = parts[0], parts[1]
if not self._is_valid_molecule_name(molecule_name):
self.skipped_lines.append((original_line, "Invalid molecule name format"))
continue
if not self._is_valid_count(count_str):
self.skipped_lines.append((original_line, "Invalid molecule count"))
continue
molecules[molecule_name] = int(count_str)
if not molecules:
raise ValueError("No molecules found in topology file.")
self._molecule_count = molecules
[docs]
def report_skipped(self) -> None:
"""Print a summary of lines that were skipped during parsing, including the line content and the reason for skipping."""
if self.skipped_lines:
print("Skipped lines during parsing:")
for line, reason in self.skipped_lines:
print(f" Line: '{line}' => Reason: {reason}")
@cached_property
def molecule_count(self) -> dict[str, int]:
"""dict[str, int]: Dictionary of molecules present and their corresponding numbers."""
if "_molecule_count" not in self.__dict__:
self.parse()
return self._molecule_count
@property
def molecules(self) -> list[str]:
"""list[str]: Names of molecules present."""
return list(self.molecule_count.keys())
@property
def total_molecules(self) -> int:
"""int: Total number of molecules present."""
return sum(self.molecule_count.values())
@property
def electron_count(self) -> dict[str, int]:
"""dict: Empty dict of electron counts."""
return {}
@property
def box_volume(self) -> float:
"""float: NaN value for box volume."""
return np.nan