from pymongo import MongoClient
from pathlib import Path
from pprint import pprint
from pharmaforge.database import DataBase
from pharmaforge.dbutils.create_mongodb_collections import process_hdf5_folder
from pharmaforge.dbutils.mongo_utils import add_field_to_all_documents
class GeneralRecipe:
    """A recipe for generating a MongoDB database of molecules.

    On construction the recipe connects to MongoDB, then either rebuilds the
    named database from a folder of HDF5 files (when ``input_dir`` is given)
    or attaches to the existing database. Finally, every keyword argument is
    written as a field onto all documents of every collection.

    Parameters
    ----------
    database_name : str
        The name of the database to create or attach to.
    client : MongoClient or str
        An existing MongoDB client, or a host/URI that is passed to
        ``MongoClient`` to create one.
    input_dir : str, optional
        The directory containing the HDF5 files to process. If None, no
        files are processed and the existing database is used as-is.
    kwargs : dict
        Additional keyword arguments for the recipe. These are one-for-one
        key/value pairs that get added to ALL the database entries.
        ``level_of_theory`` and ``data_source`` are required whenever
        ``input_dir`` is given.
    """

    def __init__(self, database_name, client, input_dir=None, **kwargs):
        self.kwargs = kwargs
        self.database_name = database_name
        self.open_client(client)
        if input_dir is not None:
            self.process_hdf5(input_dir)
        else:
            # Nothing to ingest: attach to the existing database and report
            # which collections it currently holds.
            self.db = self.client[self.database_name]
            self.check_collections(verbose=True)
        self.add_field_from_kwargs()

    def open_client(self, client):
        """Attach a MongoDB client to the recipe.

        Parameters
        ----------
        client : MongoClient or str
            An existing client, which is reused as-is, or a host/URI that is
            handed to ``MongoClient`` to open a new connection.

        Returns
        -------
        None
        """
        # Reuse an already-constructed client instead of wrapping it in a
        # second MongoClient(...) call, which only accepts host/URI values.
        if isinstance(client, MongoClient):
            self.client = client
        else:
            self.client = MongoClient(client)

    def check_collections(self, verbose=False):
        """List the collections of the recipe's database.

        A notice is always printed when the database has no collections.

        Parameters
        ----------
        verbose : bool
            If True, also print the name of every collection found.

        Returns
        -------
        list
            The collection names reported by the database.
        """
        collections = self.client[self.database_name].list_collection_names()
        if not collections:
            print(f"No collections found in database {self.database_name}.")
        elif verbose:
            print(f"Collections in database {self.database_name}:")
            for collection in collections:
                print(f"- {collection}")
        return collections

    def process_hdf5(self, input_dir):
        """Rebuild the database from the HDF5 files in the input directory.

        All inputs are validated BEFORE the database is cleared, so a wrong
        path or a missing keyword argument never destroys existing data.

        Parameters
        ----------
        input_dir : str
            The directory containing the HDF5 files to process.

        Returns
        -------
        None

        Raises
        ------
        FileNotFoundError
            If the input directory does not exist.
        ValueError
            If the level_of_theory or data_source is not specified.
        """
        self.input_dir = Path(input_dir)
        if not self.input_dir.exists():
            raise FileNotFoundError(f"Input directory {self.input_dir} does not exist")
        if not self.kwargs.get("level_of_theory"):
            raise ValueError("level_of_theory must be specified")
        if not self.kwargs.get("data_source"):
            raise ValueError("data_source must be specified")

        # Destructive step: only reached once every input has been validated.
        self.clear_db()
        process_hdf5_folder(
            folder_path=f"{self.input_dir}",
            database_name=self.database_name,
            # Both keys are guaranteed present and truthy by the checks above.
            level_of_theory=self.kwargs["level_of_theory"],
            data_source=self.kwargs["data_source"],
        )
        self.db = self.client[self.database_name]

    def clear_db(self):
        """Drop every collection in the recipe's database.

        Also (re)binds ``self.db`` to the database named by
        ``self.database_name``.

        Returns
        -------
        None
        """
        self.db = self.client[self.database_name]
        for key in self.db.list_collection_names():
            self.db.drop_collection(key)

    def add_field(self, collection, field_name, field_value):
        """Add a field to all documents of one collection.

        Parameters
        ----------
        collection : str
            The name of the collection to update. It must already exist.
        field_name : str
            The name of the field to add.
        field_value : any
            The value of the field to add.

        Returns
        -------
        any
            Whatever ``add_field_to_all_documents`` returns.

        Raises
        ------
        ValueError
            If the collection does not exist in the database.
        """
        if collection not in self.check_collections():
            raise ValueError(f"Collection {collection} does not exist")
        return add_field_to_all_documents(self.database_name, collection, field_name, field_value)

    def add_field_from_kwargs(self):
        """Add every recipe keyword argument as a field to every collection.

        Each key/value pair in ``self.kwargs`` is passed to ``add_field`` for
        each collection currently in the database.

        Returns
        -------
        None
        """
        for collection in self.check_collections():
            for key, value in self.kwargs.items():
                self.add_field(collection, key, value)

    def pprint_one_entry(self):
        """Pretty print one entry from each collection in the database.

        Returns
        -------
        None
        """
        for collection in self.check_collections():
            entry = self.db[collection].find_one()
            print(f"Collection: {collection}")
            pprint(entry)