# Source code for pharmaforge.stats.comparator

import numpy as np

from pharmaforge.database import Loader
from pharmaforge.queries.query import Query


# [docs]
class Comparator:
    """
    A class to do operations on data stored in the database, such as comparing data from different levels of theory.

    Attributes
    ----------
    client : pharmaforge.database.Loader
        The MongoDB client object used to connect to the database. ``calculate`` reads its
        ``db`` and ``collections`` attributes.

    Parameters
    ----------
    client_addr : str, optional
        The address of the MongoDB client. Default is None, which uses the default address "mongodb://localhost:27017".
    database_name : str, optional
        The name of the MongoDB database. Default is None.
    collection_name : str, optional
        The name of the MongoDB collection. Default is None.

    """

    def __init__(self, client_addr=None, database_name=None, collection_name=None):
        # All three arguments are forwarded unchanged to the Loader.
        self.client = Loader(client_addr=client_addr,
                             database_name=database_name,
                             collection_name=collection_name)

    def calculate(self, first_level, second_level, quantity="energies", query="nmols gt 0", comparetype="MAE", verbose=False):
        """
        Calculate the mean absolute error or mean squared error between two levels of theory for a given quantity.

        For every collection exposed by the client, the molecules matching ``query`` are fetched and the
        data stored under ``quantity`` for the two levels of theory are flattened and compared
        element-wise. The per-molecule error sums are accumulated and divided by the total number of
        compared elements, giving one mean error per collection.

        Molecules are skipped (with a printed message) when either level of theory is missing, when
        ``quantity`` is missing for either level, or when the two flattened data sets differ in length
        (for example because a calculation did not converge for some of the frames).

        Parameters
        ----------
        first_level : str
            The first level of theory.
        second_level : str
            The second level of theory.
        quantity : str, optional
            The quantity to compare. Default is "energies". Other option is "forces".
        query : str, optional
            The query to filter the data. Default is "nmols gt 0". The string is handed to
            ``pharmaforge.queries.query.Query``.
        comparetype : str, optional
            The error metric: "MAE" (mean absolute error, the default) or "MSE" (mean squared error).
        verbose : bool, optional
            If True, print the result for each collection; the flag is also forwarded to the query.
            Default is False.

        Returns
        -------
        dict
            Maps each collection name to its mean error rounded to 4 decimals. Collections for which
            the query returned None or no molecule could be compared are omitted.

        Raises
        ------
        ValueError
            If ``comparetype`` is not "MAE" or "MSE", or if the database is not found.

        Example
        -------
        >>> from pharmaforge.stats.comparator import Comparator
        >>> compare = Comparator(
        ...     client_addr="mongodb://localhost:27017/",
        ...     database_name="QDPi1_Database",
        ... )
        >>> results = compare.calculate(
        ...     first_level="wB97X/6-31G*-DFTB3",
        ...     second_level="QDPi1",
        ...     quantity="forces",
        ...     query="nmols gt 0",
        ...     comparetype="MAE"
        ... )

        """
        # Resolve the metric first so a typo fails immediately with a clear
        # message instead of a "'NoneType' object is not callable" deep in the loop.
        error_sums = {"MAE": Comparator._MAE, "MSE": Comparator._MSE}
        if comparetype not in error_sums:
            raise ValueError(
                f"Unknown comparetype {comparetype!r}. Expected one of: {', '.join(error_sums)}."
            )
        comp_op = error_sums[comparetype]

        if self.client.db is None:
            raise ValueError("Database not found. Please check the database name.")

        q = Query(query)
        all_comp = {}
        for collection in self.client.collections:
            result = 0.0
            count = 0
            results = q.apply(self.client.db, collection, verbose=verbose)
            if results is None:
                continue
            for mol in results:
                mol_id = mol.get("molecule_id")
                levels = mol["theorylevels"]
                if first_level not in levels:
                    print(f"First level {first_level} not found in molecule {mol_id}.")
                    continue
                if second_level not in levels:
                    print(f"Second level {second_level} not found in molecule {mol_id}.")
                    continue
                first_entry = levels[first_level]
                second_entry = levels[second_level]
                # A level can be present without the requested quantity; skip the
                # molecule rather than abort the whole comparison with a KeyError.
                if quantity not in first_entry or quantity not in second_entry:
                    print(f"Quantity {quantity} not found for {first_level} or {second_level} in molecule {mol_id}.")
                    continue
                # Flatten every stored entry into a single 1-D vector per level.
                first_data = np.concatenate([np.array(x).flatten() for x in first_entry[quantity]])
                second_data = np.concatenate([np.array(x).flatten() for x in second_entry[quantity]])
                if len(first_data) != len(second_data):
                    print(f"Data lengths do not match for {first_level} and {second_level} in collection {collection} for {mol_id}.")
                    continue
                # The helpers return error *sums*; the mean is taken once per
                # collection over the total number of compared elements.
                result += comp_op(first_data, second_data)
                count += len(first_data)
            if count == 0:
                print(f"No data found for {first_level} and {second_level} in collection {collection}.")
                continue
            result /= count
            all_comp[collection] = np.round(result, 4)
            if verbose:
                print(f"{comparetype} for {first_level} and {second_level} in collection {collection}: {np.round(result, 4)}")
        return all_comp

    @staticmethod
    def _MSE(first_data, second_data):
        """
        Calculate the summed squared error between two data sets. This is a helper function that does not calculate the mean.

        Parameters
        ----------
        first_data : array-like
            The first data set.
        second_data : array-like
            The second data set.

        Returns
        -------
        float
            The sum of the squared element-wise differences between the two data sets.

        """
        # asarray lets plain lists/tuples through; arrays are passed without copying.
        diff = np.asarray(first_data) - np.asarray(second_data)
        return (diff ** 2).sum()

    @staticmethod
    def _MAE(first_data, second_data):
        """
        Calculate the summed absolute error between two data sets. This is a helper function that does not calculate the mean.

        Parameters
        ----------
        first_data : array-like
            The first data set.
        second_data : array-like
            The second data set.

        Returns
        -------
        float
            The sum of the absolute element-wise differences between the two data sets.

        """
        # asarray lets plain lists/tuples through; arrays are passed without copying.
        diff = np.asarray(first_data) - np.asarray(second_data)
        return np.abs(diff).sum()