Source code for kbkit.systems.properties

"""Unified interface for extracting molecular and system-level properties from GROMACS input files."""

import inspect
from functools import cached_property
from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd

from kbkit.config.unit_registry import load_unit_registry
from kbkit.io import EdrParser, GroParser, TopParser
from kbkit.utils.format import ENERGY_ALIASES, resolve_attr_key
from kbkit.utils.validation import validate_path
from kbkit.visualization.timeseries import TimeseriesPlotter


[docs] class SystemProperties: """ Interface for accessing thermodynamic and structural properties of a GROMACS system. Combines topology (.top), structure (.gro), and energy (.edr) files into a unified property accessor. Supports alias resolution, unit conversion, and ensemble-aware file discovery. Parameters ---------- system_path : str or Path, optional Path to the system directory containing GROMACS files. include : str, optional String to include in file name for valid file. Only used if multiple files are found with the same suffix. edr_path: str or Path, optional Path for an energy (.edr) file. top_path: str or Path, optional Path for a topology (.top) file. gro_path: str or Path, optional Path for a structure (.gro) file. start_time : int, optional Default start time (in ps) for time-averaged properties. Attributes ---------- edr_paths: list[Path] List of paths to GROMACS energy files. top_paths: list[Path] List of paths to GROMACS topology files. gro_paths: list[Path] List of paths to GROMACS structure files. .. note:: - Defaults to looking at files/paths directly specified. - If files are not specified or do not exist, a ``system_path`` is required to locate the files with necessary suffix. """ def __init__( self, system_path: str | None = None, include: str = "", edr_path: str | None = None, top_path: str | None = None, gro_path: str | None = None, start_time: float = 0.0, ) -> None: self.start_time = float(start_time) # setup registry for unit conversions self.ureg = load_unit_registry() # Load the unit registry for unit conversions self.Q_ = self.ureg.Quantity # validate system paths self.system_path = validate_path(system_path) if system_path else system_path # get files; first prioritize specified file; then search directory if files do not exist self.edr_paths = self.find_files( suffix=".edr", include=include, filepath=edr_path, system_path=self.system_path ) self.gro_paths = self.find_files( suffix=".gro", include=include, filepath=gro_path, system_path=self.system_path ) # this is not required! try: self.top_paths = self.find_files( suffix=".top", include=include, filepath=top_path, system_path=self.system_path ) except ValueError: if len(self.gro_paths) > 0: self.top_paths = [] else: raise # update system-path if not defined and all files have same parent if self.system_path is None: e_parents = [f.parent for f in self.edr_paths] g_parents = [f.parent for f in self.gro_paths] t_parents = [f.parent for f in self.top_paths] parents = np.unique([e_parents + g_parents + t_parents]) if len(parents) == 1: self.system_path = parents[0]
[docs] @staticmethod def find_files( suffix: str, filepath: str | Path | None = None, system_path: str | Path | None = None, include: str = "", exclude: list[str] | None = None, ) -> list[Path]: """ Get list of files with a given suffix in system directory. Parameters ---------- suffix: str File type to search for. (i.e., `.edr`, `.gro`, `.top`) filepath: str, optional Optional filepath to validate. Optional if ``system_path`` is included. system_path: str, optional Parent path containing files. Optional if ``filepath`` is included. include: str, optional String to filter files by. Will only incorporate if more than one file of the desired suffix is found. exclude: list[str], optional String to exclude from valid files. Will only be searched if more than 1 files found after ``include`` filter. Returns ------- list[Path] List of path objects containing files of desired suffix. """ # validate filepath and parent directory if filepath: filepath = validate_path(filepath, suffix) if system_path is None: system_path = validate_path(filepath.parent) else: system_path = validate_path(system_path) elif system_path: system_path = validate_path(system_path) else: raise ValueError("A valid 'filepath' or 'system_path' is required!") # get files files = [filepath] if filepath else sorted(system_path.glob(f"*.{suffix.strip('.')}")) if len(files) == 1: return files # filter files by ``include`` argument if more than one files are found. files_filtered = [f for f in list(files) if include in f.name] if not files_filtered: return files # refine files 1 more time by things not to include; i.e., inital runs exclude = exclude or ["init", "eq", "em"] files_filtered_again = sorted([f for f in files_filtered if not any(x in f.name for x in exclude)]) return files_filtered if not files_filtered_again else files_filtered_again
@cached_property def energy(self) -> list[EdrParser]: """list[EdrParser]: Setup EDR file parsers for all files in ``edr_paths``.""" if len(self.edr_paths) == 0: raise FileNotFoundError("Energy file(s) do not exist!") return [EdrParser(Path(fpath)) for fpath in self.edr_paths] @cached_property def topology(self) -> GroParser | TopParser: """GroParser | TopParser: Setup Gro/Top parser.""" # prioritize GRO files -- they contain electron info as well if len(self.gro_paths) > 0: for file in self.gro_paths: gro = GroParser(file) if any(len(mol) > 1 for mol in gro.molecules): return gro # if gro file(s) are invalid; use topology file if len(self.top_paths) > 0: for file in self.top_paths: return TopParser(file) raise FileNotFoundError(f"No topology or structure file found in '{self.system_path}'") @property def topology_properties(self) -> list[str]: """list[str]: Get list of accessible topology properties.""" return [name for name, _ in inspect.getmembers(self.topology) if not name.startswith("_")]
[docs] def get(self, name: str, units: str | None = None, avg: bool = True, time_series: bool = False) -> Any: """ Master function for getting any property from ``energy`` or ``topology`` files. Parameters ---------- name: str Name for the property to extract. units: str, optional Units to convert energy properties to. If not specified, default units from `pyedr` will be used. avg: bool, optional Returns averaged property if True (default: True). Otherwise returns array of values. time_series: bool, optional Returns both times and values if True (default: False). Returns ------- float | np.ndarray | list[np.ndarray] Topology or energy property in desired units. """ # 1. if property is in topology; return it if name in self.topology_properties: return getattr(self.topology, name) # now triple check if electrons are desired but another name is used if any(xx in name.lower() for xx in ("elec", "z_", "z-")): return self.topology.electron_count # 2. now for energy properties # first check if property are units; in which case just return unit dictionary if "unit" in name.lower(): return self.energy[0].units box_volume = self.topology.box_volume # resolves common property names for all EDR properties prop = resolve_attr_key(name, ENERGY_ALIASES).lower() # now compute properties from edr-files times: list[float] = [] values = [] value: float | np.ndarray for _i, edr in enumerate(self.energy): if prop == "cp": value = edr.cp( nmol=self.topology.total_molecules, volume=box_volume, start_time=self.start_time, units=units ) elif prop == "cv": value = edr.cv(nmol=self.topology.total_molecules, start_time=self.start_time, units=units) elif prop == "enthalpy": value = edr.molar_enthalpy( nmol=self.topology.total_molecules, volume=box_volume, start_time=self.start_time, units=units ) elif prop == "isothermal-compressibility": value = edr.isothermal_compressibility(start_time=self.start_time, units=units) elif prop in ("number-density", "molar-volume"): # get molar volume and convert to number density if desired units = units or edr.units["molar-volume"] units = units if prop == "molar-volume" else f"{units.split('/')[1]}/{units.split('/')[0]}" Vi = edr.molar_volume( nmol=self.topology.total_molecules, volume=box_volume, start_time=self.start_time, units=units ) if prop == "number-density": value = 1 / Vi else: value = Vi else: value = edr.get(prop, start_time=self.start_time, units=units) # now average values if desired if avg or prop in EdrParser.FLUCT_PROPS: values.append(np.mean(value)) elif isinstance(value, (list | np.ndarray)): values.extend(value) times.extend(edr.get("time", start_time=self.start_time)) # return desired values if avg or prop in EdrParser.FLUCT_PROPS: # Add check before computing mean if len(values) > 0: return float(np.mean(values)) else: return np.nan else: # place into pd.DataFrame and sort by times; if any duplicates are found--remove them df = pd.DataFrame({"times": times, "values": values}) df.sort_values("times", inplace=True) df.drop_duplicates(subset=["times"], keep="first", inplace=True) arr = df.to_numpy() # return times and values if desired if time_series: return arr[:, 0], arr[:, 1] else: return arr[:, 1]
[docs] def timeseries_plotter(self, start_time: int = 0) -> TimeseriesPlotter: """ Create a TimeseriesPlotter for visualizing time series data for a given system. Parameters ---------- start_time: int Initial time for plotting. Returns ------- TimeseriesPlotter Plotter instance for computing simulation energy properties. """ return TimeseriesPlotter(self, start_time=start_time)