Source code for pharmaforge.stats.comparator

import numpy as np

from pharmaforge.database import Loader
from pharmaforge.queries.query import Query

class Comparator:
    """
    Compare data stored in the database across levels of theory.

    Attributes
    ----------
    client : pharmaforge.database.Loader
        The loader object used to connect to the MongoDB database. Its
        ``db`` and ``collections`` attributes are read by :meth:`calculate`.

    Parameters
    ----------
    client_addr : str, optional
        The address of the MongoDB client. Default is None, which uses the
        default address "mongodb://localhost:27017".
    database_name : str, optional
        The name of the MongoDB database. Default is None.
    collection_name : str, optional
        The name of the MongoDB collection. Default is None.
    """

    def __init__(self, client_addr=None, database_name=None, collection_name=None):
        self.client = Loader(
            client_addr=client_addr,
            database_name=database_name,
            collection_name=collection_name,
        )

    def calculate(self, first_level, second_level, quantity="energies",
                  query="nmols gt 0", comparetype="MAE", verbose=False):
        """
        Compute the MAE or MSE between two levels of theory, per collection.

        For every collection known to the client, the molecules matching
        ``query`` are fetched, the values of ``quantity`` (e.g., energies or
        forces) stored under both levels of theory are flattened, and the
        element-wise error is accumulated. The accumulated error is divided
        by the total number of compared values and rounded to 4 decimals.

        Molecules that lack one of the two levels of theory, or whose
        flattened data differ in length between the two levels (for example
        because the SCF did not converge during relabeling), are reported
        with a printed message and skipped.

        Parameters
        ----------
        first_level : str
            The first level of theory.
        second_level : str
            The second level of theory.
        quantity : str, optional
            The quantity to compare. Default is "energies". Other option is
            "forces".
        query : str, optional
            The query string used to filter the data; it is passed to
            ``Query``. Default is "nmols gt 0".
        comparetype : str, optional
            The error metric: "MAE" (mean absolute error, the default) or
            "MSE" (mean squared error).
        verbose : bool, optional
            If True, print the result for each collection and forward the
            flag to the query. Default is False.

        Returns
        -------
        dict
            Maps each collection name to its error rounded to 4 decimals.
            Collections for which the query returned None, or in which no
            values could be compared, are omitted.

        Raises
        ------
        ValueError
            If ``comparetype`` is not "MAE" or "MSE", or if the database is
            not found.
        KeyError
            If a matching molecule has both levels of theory but no entry
            for ``quantity`` under one of them.

        Example
        -------
        >>> from pharmaforge.stats.comparator import Comparator
        >>> compare = Comparator(
        ...     client_addr="mongodb://localhost:27017/",
        ...     database_name="QDPi1_Database",
        ... )
        >>> results = compare.calculate(
        ...     first_level="wB97X/6-31G*-DFTB3",
        ...     second_level="QDPi1",
        ...     quantity="forces",
        ...     query="nmols gt 0",
        ...     comparetype="MAE"
        ... )
        """
        # Validate the metric first so an unsupported name fails immediately
        # with a clear message instead of a "NoneType is not callable" error
        # in the middle of the loop.
        operations = {"MAE": Comparator._MAE, "MSE": Comparator._MSE}
        comp_op = operations.get(comparetype)
        if comp_op is None:
            raise ValueError(
                f"Unknown comparetype {comparetype!r}. "
                f"Supported values are: {', '.join(sorted(operations))}."
            )

        if self.client.db is None:
            raise ValueError("Database not found. Please check the database name.")

        q = Query(query)
        all_comp = {}
        for collection in self.client.collections:
            total_error = 0.0
            count = 0
            results = q.apply(self.client.db, collection, verbose=verbose)
            if results is None:
                continue

            for mol in results:
                mol_id = mol.get("molecule_id")
                levels = mol["theorylevels"]
                if first_level not in levels:
                    print(f"First level {first_level} not found in molecule {mol_id}.")
                    continue
                if second_level not in levels:
                    print(f"Second level {second_level} not found in molecule {mol_id}.")
                    continue

                first_data = Comparator._flatten(levels[first_level][quantity])
                second_data = Comparator._flatten(levels[second_level][quantity])
                if len(first_data) != len(second_data):
                    print(f"Data lengths do not match for {first_level} and {second_level} in collection {collection} for {mol_id}.")
                    continue

                # The helpers return summed (not averaged) errors, so the
                # mean is taken over every compared value in the collection.
                total_error += comp_op(first_data, second_data)
                count += len(first_data)

            if count == 0:
                print(f"No data found for {first_level} and {second_level} in collection {collection}.")
                continue

            mean_error = np.round(total_error / count, 4)
            all_comp[collection] = mean_error
            if verbose:
                print(f"{comparetype} for {first_level} and {second_level} in collection {collection}: {mean_error}")
        return all_comp

    @staticmethod
    def _flatten(entries):
        """
        Flatten a sequence of array-like entries into one 1-D array.

        Parameters
        ----------
        entries : iterable of array-like
            The per-entry data (scalars or nested sequences).

        Returns
        -------
        numpy.ndarray
            All values concatenated in order; an empty array when
            ``entries`` is empty (``np.concatenate`` would raise on it).
        """
        chunks = [np.asarray(entry).ravel() for entry in entries]
        if not chunks:
            return np.empty(0)
        return np.concatenate(chunks)

    @staticmethod
    def _MSE(first_data, second_data):
        """
        Calculate the squared error between two data sets.

        This is a helper function that does not calculate the mean.

        Parameters
        ----------
        first_data : array-like
            The first data set.
        second_data : array-like
            The second data set.

        Returns
        -------
        float
            The summed squared error between the two data sets.
        """
        # asarray lets plain lists/tuples be passed, as the docs promise.
        diff = np.asarray(first_data) - np.asarray(second_data)
        return float(np.sum(diff ** 2))

    @staticmethod
    def _MAE(first_data, second_data):
        """
        Calculate the absolute error between two data sets.

        This is a helper function that does not calculate the mean.

        Parameters
        ----------
        first_data : array-like
            The first data set.
        second_data : array-like
            The second data set.

        Returns
        -------
        float
            The summed absolute error between the two data sets.
        """
        # asarray lets plain lists/tuples be passed, as the docs promise.
        diff = np.asarray(first_data) - np.asarray(second_data)
        return float(np.sum(np.abs(diff)))