import numpy as np
from pharmaforge.database import Loader
from pharmaforge.queries.query import Query
class Comparator:
    """
    Compare data stored in the database across different levels of theory.

    Attributes
    ----------
    client : pharmaforge.database.Loader
        The MongoDB client object used to connect to the database.

    Parameters
    ----------
    client_addr : str, optional
        The address of the MongoDB client. Default is None, which uses the default address "mongodb://localhost:27017".
    database_name : str, optional
        The name of the MongoDB database. Default is None.
    collection_name : str, optional
        The name of the MongoDB collection. Default is None.
    """

    def __init__(self, client_addr=None, database_name=None, collection_name=None):
        self.client = Loader(client_addr=client_addr,
                             database_name=database_name,
                             collection_name=collection_name)

    def calculate(self, first_level, second_level, quantity="energies", query="nmols gt 0", comparetype="MAE", verbose=False):
        """
        Calculate the mean absolute error or mean squared error between two levels of theory for a given quantity.

        For every collection known to the client, the molecules matching ``query`` are fetched and the
        data stored under ``quantity`` for the two levels of theory are compared element by element.
        The per-collection value is the summed error divided by the total number of compared values.
        Molecules are skipped (with a printed message) when either level of theory is missing, when the
        requested quantity is missing for either level, or when the two data sets differ in length
        (some molecules may not have data for both levels of theory, or SCF may not have converged
        during the relabeling of data).

        Parameters
        ----------
        first_level : str
            The first level of theory.
        second_level : str
            The second level of theory.
        quantity : str, optional
            The quantity to compare. Default is "energies". Other option is "forces".
        query : str, optional
            The query to filter the data. Default is "nmols gt 0". The string is handed to
            ``pharmaforge.queries.query.Query``.
        comparetype : str, optional
            The error metric: "MAE" (mean absolute error, the default) or "MSE" (mean squared error).
        verbose : bool, optional
            If True, the flag is forwarded to the query and the result for each collection is printed.
            Default is False.

        Returns
        -------
        dict
            Maps each collection name to the requested error metric, rounded to 4 decimals.
            Collections for which the query returned nothing, or for which no molecule could be
            compared, are left out.

        Raises
        ------
        ValueError
            If ``comparetype`` is not "MAE" or "MSE", or if the database is not found.
            An invalid query is also documented to raise this error; that would originate
            inside ``Query``, which is not part of this class.

        Example
        -------
        >>> from pharmaforge.stats.comparator import Comparator
        >>> compare = Comparator(
        ...     client_addr="mongodb://localhost:27017/",
        ...     database_name="QDPi1_Database",
        ... )
        >>> results = compare.calculate(
        ...     first_level="wB97X/6-31G*-DFTB3",
        ...     second_level="QDPi1",
        ...     quantity="forces",
        ...     query="nmols gt 0",
        ...     comparetype="MAE"
        ... )
        """
        operators = {"MAE": Comparator._MAE, "MSE": Comparator._MSE}
        # Fail early with a clear message instead of calling None deep inside the loop.
        if comparetype not in operators:
            raise ValueError(
                f"Unknown comparetype {comparetype!r}. Expected one of: {', '.join(operators)}."
            )
        comp_op = operators[comparetype]

        if self.client.db is None:
            raise ValueError("Database not found. Please check the database name.")

        q = Query(query)
        all_comp = {}
        for collection in self.client.collections:
            results = q.apply(self.client.db, collection, verbose=verbose)
            if results is None:
                continue

            total, count = Comparator._accumulate(
                results, first_level, second_level, quantity, comp_op, collection
            )
            if count == 0:
                print(f"No data found for {first_level} and {second_level} in collection {collection}.")
                continue

            # The helpers return sums; dividing by the number of compared values yields the mean.
            value = np.round(total / count, 4)
            all_comp[collection] = value
            if verbose:
                print(f"{comparetype} for {first_level} and {second_level} in collection {collection}: {value}")
        return all_comp

    @staticmethod
    def _accumulate(molecules, first_level, second_level, quantity, comp_op, collection):
        """
        Sum the error and the number of compared values over an iterable of molecule documents.

        Parameters
        ----------
        molecules : iterable of dict
            Molecule documents; each must provide a "theorylevels" mapping.
        first_level : str
            The first level of theory.
        second_level : str
            The second level of theory.
        quantity : str
            The key of the data to compare inside each level of theory.
        comp_op : callable
            Function returning the summed error of two equally long 1-D arrays.
        collection : str
            Collection name, used only in the printed messages.

        Returns
        -------
        tuple
            ``(total_error, n_values)`` accumulated over all molecules that could be compared.
        """
        total = 0.0
        count = 0
        for mol in molecules:
            mol_id = mol.get("molecule_id")
            levels = mol["theorylevels"]
            if first_level not in levels:
                print(f"First level {first_level} not found in molecule {mol_id}.")
                continue
            if second_level not in levels:
                print(f"Second level {second_level} not found in molecule {mol_id}.")
                continue
            # A level may exist without the requested quantity; skip it like a missing level.
            if quantity not in levels[first_level] or quantity not in levels[second_level]:
                print(f"Quantity {quantity} not found for {first_level} or {second_level} in molecule {mol_id}.")
                continue

            first_data = Comparator._flatten(levels[first_level][quantity])
            second_data = Comparator._flatten(levels[second_level][quantity])
            if len(first_data) != len(second_data):
                print(f"Data lengths do not match for {first_level} and {second_level} in collection {collection} for {mol_id}.")
                continue

            total += comp_op(first_data, second_data)
            count += len(first_data)
        return total, count

    @staticmethod
    def _flatten(entries):
        """
        Flatten a sequence of (possibly nested) entries into a single 1-D array.

        An empty sequence yields an empty array, because ``np.concatenate`` raises on an empty list.
        """
        arrays = [np.asarray(entry).flatten() for entry in entries]
        if not arrays:
            return np.empty(0)
        return np.concatenate(arrays)

    @staticmethod
    def _MSE(first_data, second_data):
        """
        Calculate the squared error between two data sets. This is a helper function that does not calculate the mean.

        Parameters
        ----------
        first_data : array-like
            The first data set.
        second_data : array-like
            The second data set.

        Returns
        -------
        float
            The sum of the squared element-wise differences between the two data sets.
        """
        # asarray lets plain lists be used, as the "array-like" contract promises.
        difference = np.asarray(first_data) - np.asarray(second_data)
        return (difference ** 2).sum()

    @staticmethod
    def _MAE(first_data, second_data):
        """
        Calculate the absolute error between two data sets. This is a helper function that does not calculate the mean.

        Parameters
        ----------
        first_data : array-like
            The first data set.
        second_data : array-like
            The second data set.

        Returns
        -------
        float
            The sum of the absolute element-wise differences between the two data sets.
        """
        # asarray lets plain lists be used, as the "array-like" contract promises.
        difference = np.asarray(first_data) - np.asarray(second_data)
        return np.abs(difference).sum()