# Source code for pharmaforge.recipes.GeneralDatabase

from pymongo import MongoClient
from pathlib import Path
from pprint import pprint

from pharmaforge.database import DataBase
from pharmaforge.dbutils.create_mongodb_collections import process_hdf5_folder
from pharmaforge.dbutils.mongo_utils import add_field_to_all_documents


class GeneralRecipe:
    """
    A class to represent a recipe for generating a database of molecules.

    Parameters
    ----------
    database_name : str
        The name of the database to create or attach to.
    client : MongoClient or str
        Either an already-constructed ``MongoClient`` (used as-is) or a
        connection string / host that is passed to ``MongoClient``.
    input_dir : str, optional
        The directory containing the HDF5 files to process. When omitted,
        the recipe attaches to the existing database instead of rebuilding
        it.
    kwargs : dict
        Additional keyword arguments to pass to the recipe. These are
        one-for-one key/value pairs that get added to every collection of
        the database through ``add_field``. ``level_of_theory`` and
        ``data_source`` are required when ``input_dir`` is given.
    """

    def __init__(self, database_name, client, input_dir=None, **kwargs):
        self.kwargs = kwargs
        self.database_name = database_name
        self.open_client(client)
        if input_dir is not None:
            self.process_hdf5(input_dir)
        else:
            # No input to ingest: attach to the database and report which
            # collections (if any) it currently holds.
            self.db = self.client[self.database_name]
            self.check_collections(verbose=True)
        self.add_field_from_kwargs()

    def open_client(self, client):
        """
        Attach a MongoDB client to the recipe.

        Parameters
        ----------
        client : MongoClient or str
            An existing ``MongoClient`` instance, which is reused, or any
            value accepted by the ``MongoClient`` constructor (for example
            a connection URI), from which a new client is created.

        Returns
        -------
        None
        """
        if isinstance(client, MongoClient):
            # Reuse the caller's client rather than trying to build a new
            # one from it, which MongoClient does not support.
            self.client = client
        else:
            self.client = MongoClient(client)

    def check_collections(self, verbose=False):
        """
        List the collections of the recipe's database.

        Parameters
        ----------
        verbose : bool, optional
            When True, print the name of every collection found.

        Returns
        -------
        list
            The collection names reported by the database. A message is
            printed (regardless of ``verbose``) when there are none.
        """
        collections = self.client[self.database_name].list_collection_names()
        if not collections:
            print(f"No collections found in database {self.database_name}.")
        elif verbose:
            print(f"Collections in database {self.database_name}:")
            for collection in collections:
                print(f"- {collection}")
        return collections

    def process_hdf5(self, input_dir):
        """
        Process the HDF5 files in the input directory.

        All inputs are validated *before* the existing database is cleared,
        so a failed call leaves the stored collections untouched.

        Parameters
        ----------
        input_dir : str
            The directory containing the HDF5 files to process.

        Returns
        -------
        None

        Raises
        ------
        FileNotFoundError
            If the input directory does not exist.
        ValueError
            If the level_of_theory or data_source is not specified.
        """
        self.input_dir = Path(input_dir)
        if not self.input_dir.exists():
            raise FileNotFoundError(f"Input directory {self.input_dir} does not exist")

        level_of_theory = self.kwargs.get("level_of_theory")
        if not level_of_theory:
            raise ValueError("level_of_theory must be specified")

        data_source = self.kwargs.get("data_source")
        if not data_source:
            raise ValueError("data_source must be specified")

        # Destructive step: only reached once every precondition holds.
        self.clear_db()
        process_hdf5_folder(
            folder_path=str(self.input_dir),
            database_name=self.database_name,
            level_of_theory=level_of_theory,
            data_source=data_source,
        )
        self.db = self.client[self.database_name]

    def clear_db(self):
        """
        Clear the recipe's database by dropping every collection in it.

        Also (re)binds ``self.db`` to the database named by
        ``self.database_name``.

        Returns
        -------
        None
        """
        self.db = self.client[self.database_name]
        for key in self.db.list_collection_names():
            self.db.drop_collection(key)

    def add_field(self, collection, field_name, field_value):
        """
        Add a field to the documents of one collection.

        Parameters
        ----------
        collection : str
            The name of the collection to update; it must already exist.
        field_name : str
            The name of the field to add.
        field_value : any
            The value of the field to add.

        Returns
        -------
        The return value of ``add_field_to_all_documents``.

        Raises
        ------
        ValueError
            If the collection does not exist in the database.
        """
        collections = self.check_collections()
        if collection not in collections:
            raise ValueError(f"Collection {collection} does not exist")
        return add_field_to_all_documents(self.database_name, collection, field_name, field_value)

    def add_field_from_kwargs(self):
        """
        Add every key/value pair in ``self.kwargs`` as a field.

        Each pair is applied to each collection of the database through
        ``add_field``.

        Returns
        -------
        None
        """
        for collection in self.check_collections():
            for key, value in self.kwargs.items():
                self.add_field(collection, key, value)

    def pprint_one_entry(self):
        """
        Pretty print one entry from each collection of the database.

        Returns
        -------
        None
        """
        for collection in self.check_collections():
            entry = self.db[collection].find_one()
            print(f"Collection: {collection}")
            pprint(entry)