import h5py
import re, shutil
import os, sys
import numpy as np
from typing import Tuple, List
from rdkit import Chem
from rdkit.Chem import rdDetermineBonds
from pathlib import Path
from pharmaforge.database import DataBase
[docs]
def modify_molecule_name(mol_name: str) -> Tuple[str, List[str]]:
""" Modifies the molecule name by removing zero count elements and creating a type map.
Parameters
----------
mol_name : str
The original name of the molecule, which contains elements and their counts.
Returns
-------
Tuple[str, List[str]]
A tuple containing the modified molecule name and a list of element types.
"""
elements = re.findall(r'([A-Za-z]+)(\d+)', mol_name)
new_name = ''.join(f"{el}{count}" for el, count in elements if int(count) > 0)
type_map = [el for el, count in elements if int(count) > 0]
return new_name, type_map
[docs]
def modify_hdf5_file(original_file: str, new_file: str) -> None:
""" Modifies an HDF5 file by renaming molecules and creating a type map.
Parameters
----------
original_file : str
Path to the original HDF5 file.
new_file : str
Path to the new HDF5 file to be created.
Returns
-------
None
The function creates a new HDF5 file with modified molecule names and type maps.
Raises
------
FileNotFoundError
If the original file does not exist.
ValueError
If the original file is not an HDF5 file or if the original and new files are the same.
"""
original_file = Path(original_file)
if not original_file.is_file():
raise FileNotFoundError(f"Original file '{original_file}' does not exist.")
if original_file == new_file:
raise ValueError(f"Original file '{original_file}' and new file '{new_file}' are the same.")
with h5py.File(original_file, 'r') as orig_db, h5py.File(new_file, 'w') as new_db:
for mol_name in orig_db.keys():
new_name, type_map = modify_molecule_name(mol_name)
orig_db.copy(mol_name, new_db, name=new_name)
mol_group = new_db[new_name]
if 'type_map.raw' in mol_group:
del mol_group['type_map.raw']
mol_group.create_dataset('type_map.raw', data=type_map)
print(f"Modified HDF5 file created: {new_file}")
[docs]
def process_multiple_hdf5_files(files: List[str]) -> None:
""" Process multiple HDF5 files by modifying their contents and creating new files.
Parameters
----------
files : List[str]
A list of paths to the original HDF5 files to be processed.
Returns
-------
None
The function creates new HDF5 files with modified contents based on the original files.
Raises
------
None
The function handles file operations and exceptions internally.
"""
for original_file in files:
new_file = original_file.replace('.hdf5', '_modified.hdf5')
modify_hdf5_file(original_file, new_file)
[docs]
def save_molecule_to_xyz(db, db_name, mol, output_file='molecule_structure.xyz', config=0):
""" Saves a molecule's coordinates and types to a temporary XYZ file.
Parameters
----------
db : DataBase
An instance of the DataBase class containing the HDF5 data.
db_name : str
The key of the database to access.
mol : str
The name of the molecule to be saved.
output_file : str
The name of the output XYZ file.
config : int
The configuration option for the XYZ file.
Returns
-------
None
The function creates an XYZ file with the molecule's coordinates and types.
Raises
------
None
The function handles file operations and exceptions internally.
"""
# fetching molecule information
db_instance = db.data[db_name]
# Check if the molecule is in the database format OR if its in the deepmdkit-format
if "set.000" in db_instance[mol]:
a = db_instance[mol]['set.000']['coord.npy'][:]
else:
a = db_instance[mol]['coordinates'][:]
type_raw = db.data[db_name][mol]['type.raw'][:]
type_map = db.data[db_name][mol]['type_map.raw'][:]
coords = np.reshape(a[config], (-1, 3))
num_atoms = len(type_raw)
molecule_name = mol
with open(output_file, 'w') as file:
# Write the first two lines
file.write(f"{num_atoms}\n")
file.write(f"Molecule: {molecule_name}\n")
for i, atom_type_index in enumerate(type_raw):
atom_symbol = type_map[atom_type_index].decode("utf-8") # Decode bytes to string
atom_coords = coords[i]
file.write(f"{atom_symbol} {atom_coords[0]:.5f} {atom_coords[1]:.5f} {atom_coords[2]:.5f}\n")
print("Molecular structure saved to molecule_structure.xyz")
[docs]
def assign_smiles(molecule_name, use_suggested_charge=False, maxcount=2, xyz_file='molecule_structure.xyz'):
"""
Attempts to assign a SMILES notation to a molecule using its XYZ coordinates.
The function tries multiple possible formal charges to determine bonds and
generate a valid SMILES representation using RDKit. If all attempts fail,
the molecule name is added to the failed_mols list.
The use_suggested_charge option allows the function to use the suggested charge from the RDKit error message, which usually reads something like: `Final molecular charge (0) does not match input (-1); could not find valid bond ordering.` In this case, it would have originally tried charge 0, but then the next charge it will attempt is -1. The max_count option allows the user to set how many times the function will try this before giving up and not generating a SMILES at all.
.. warning::
This function, with the use_suggested_charge option, will try to assign the charge based on the error message from RDKit. Thus, it is important to implemennt some degree of skepticism with respect to whether the charges are actually the true charges or not.
Parameters
----------
molecule_name : str
The name of the molecule for which to generate SMILES.
failed_mols : list
A list to keep track of failed molecules.
use_suggested_charge : bool, optional
If True, use the suggested charge from the error message. Default is False.
maxcount : int, optional
The maximum number of attempts to generate SMILES with different charges. Only works when use_suggested_charge
is true. Default is 5. Otherwise, if use_suggested_charge is false, the function will only try charge 0.
xyz_file : str, optional
The path to the XYZ file containing the molecule's coordinates. Default is 'molecule_structure.xyz', which is the default for the save_molecule_to_xyz function.
Returns
-------
str or None
The SMILES notation for the molecule, or None if generation failed.
int or None
The charge used to generate the SMILES notation, or None if generation failed.
Raises
------
FileNotFoundError
If the XYZ file does not exist.
"""
xyz_file = Path(xyz_file)
if not xyz_file.is_file():
print(f"XYZ file '{xyz_file}' does not exist.")
raise FileNotFoundError(f"XYZ file '{xyz_file}' does not exist.")
raw_mol = Chem.rdmolfiles.MolFromXYZFile("molecule_structure.xyz")
if raw_mol is None:
print(f"Failed to load molecule '{molecule_name}' from XYZ file.")
return None
charge=0
stop=False
count=0
while not stop:
print("Trying charge:", charge)
mol = Chem.Mol(raw_mol) # Create a fresh copy each time
try:
rdDetermineBonds.DetermineBonds(mol, charge=charge)
smiles = Chem.MolToSmiles(mol)
print(f"SMILES notation for {molecule_name} generated with charge {charge}: {smiles}")
return smiles, charge # Return immediately on success
except Exception as e:
print(f"Charge {charge} failed for '{molecule_name}' with error: {e}")
if use_suggested_charge == True:
if "does not match input" not in str(e):
break
charge = int(str(e).split("input ")[-1].split(")")[0].split("(")[-1])
else:
print(f"Stop hit, stopping...")
stop=True
count += 1
if count >= maxcount:
print(f"Max attempts reached for molecule '{molecule_name}'.")
return None, None
continue # Try next charge
# If all charges fail
print(f"All attempts failed for molecule '{molecule_name}'.")
return None, None
# def assign_smiles(molecule_name, failed_mols):
# raw_mol = Chem.rdmolfiles.MolFromXYZFile("molecule_structure.xyz")
# if raw_mol is None:
# print(f"Failed to load molecule '{molecule_name}' from XYZ file.")
# failed_mols.append(molecule_name)
# return None
# mol = Chem.Mol(raw_mol)
# try:
# rdDetermineBonds.DetermineBonds(mol, charge=0)
# smiles = Chem.MolToSmiles(mol)
# print(f"SMILES notation for {molecule_name} is {smiles}")
# return smiles
# except Exception as e:
# print(f"Error determining bonds for molecule '{molecule_name}': {e}")
# failed_mols.append(molecule_name)
# return None
[docs]
def verify_all_shared_molecules(database_instance, db_key1, db_key2):
"""
Verify that all shared molecules between two databases have identical attributes,
including nested groups like 'set.000'.
Parameters
----------
database_instance : DataBase
An instance of the DataBase class containing the HDF5 data.
db_key1 : str
The key of the first database to compare.
db_key2 : str
The key of the second database to compare.
Returns
-------
None
Prints the matched and unmatched molecules between the two databases.
"""
# Step 1: Find shared molecules between the two databases
shared_molecules = database_instance.find_shared_molecules(db_key1, db_key2)
if not shared_molecules:
print(f"No shared molecules found between {db_key1} and {db_key2}.")
return
matched_mols = []
unmatched_mols = []
for mol_key in shared_molecules:
db1_molecule = database_instance.data[db_key1][mol_key]
db2_molecule = database_instance.data[db_key2][mol_key]
all_features_same = True # Flag to track if all features are identical
for feature in db1_molecule.keys():
if feature == 'set.000':
# Handle nested datasets in 'set.000'
for sub_feature in db1_molecule[feature].keys():
mol_data1 = db1_molecule[feature][sub_feature][:]
mol_data2 = db2_molecule[feature][sub_feature][:]
# Add detailed comparison debug output
if not np.array_equal(mol_data1, mol_data2):
all_features_same = False
break # Exit sub_feature comparison if there's a mismatch
elif feature == 'nopbc':
# Handle scalar attribute 'nopbc'
mol_data1 = db1_molecule[feature][()]
mol_data2 = db2_molecule[feature][()]
# Add detailed comparison debug output
if not np.array_equal(mol_data1, mol_data2):
all_features_same = False
break # Exit feature comparison if there's a mismatch
else:
mol_data1 = db1_molecule[feature][:]
mol_data2 = db2_molecule[feature][:]
if not np.array_equal(mol_data1, mol_data2):
all_features_same = False
break
if all_features_same:
matched_mols.append(mol_key)
else:
unmatched_mols.append(mol_key)
print("Matched molecules:", matched_mols)
print("Unmatched molecules:", unmatched_mols)
[docs]
def check_shared_molecules(database_instance):
""" Check which molecules are shared between all databases in the DataBase instance.
Parameters
----------
database_instance : DataBase
An instance of the DataBase class containing the HDF5 data.
Returns
-------
None
Prints the shared molecules between all databases.
"""
db_keys = list(database_instance.data.keys())
for i in range(len(db_keys)):
for j in range(i + 1, len(db_keys)):
db_key1 = db_keys[i]
db_key2 = db_keys[j]
shared_molecules = database_instance.find_shared_molecules(db_key1, db_key2)
if shared_molecules:
print(f"Databases {db_key1} and {db_key2} have {len(shared_molecules)} shared molecules: {shared_molecules}")
else:
print(f"Databases {db_key1} and {db_key2} do not have shared molecules.")
# This is a helper function which can be used when you don't have hdf5 files with smiles.
[docs]
def new_hdf5_file_with_smiles(original_file: str, new_file: str, exist_ok=False, use_suggested_charge=False, maxcount=5) -> None:
"""
Create a new HDF5 file with SMILES notation for each molecule. This function reads the original HDF5 file, generating a SMILES string for each molecule and saving it in the new file.
The function also handles the case where the original file and the new file are the same, and raises appropriate errors if the files do not exist or if the new file already exists.
Parameters
----------
original_file : str
Path to the original HDF5 file.
new_file : str
Path to the new HDF5 file to be created.
exist_ok : bool
If True, overwrite the existing file if it exists. Default is False.
use_suggested_charge : bool
If True, use the suggested charge from the error message. Default is False. (option for assign_charge)
maxcount : int
The maximum number of attempts to generate SMILES with different charges. Default is 5. (option for assign_charge)
Raises
------
FileNotFoundError
If the original file does not exist.
ValueError
If the original file is not an HDF5 file.
FileExistsError
If the new file already exists and exist_ok is False.
OSError
If the directory for the new file cannot be created.
See Also
--------
molecule_to_xyz : Function to save a molecule's coordinates and types to a temporary XYZ file.
assign_smiles : Function to generate SMILES notation for a molecule using its XYZ coordinates.
"""
original_file = Path(original_file)
if not original_file.is_file():
raise FileNotFoundError(f"Original file '{original_file}' does not exist.")
if not original_file.suffix == '.hdf5':
raise ValueError(f"Original file '{original_file}' is not an HDF5 file.")
if original_file == new_file:
raise ValueError(f"Original file '{original_file}' and new file '{new_file}' are the same.")
new_file = Path(new_file)
if new_file.is_file() and not exist_ok:
raise FileExistsError(f"New file '{new_file}' already exists. Please choose a different name.")
if new_file.suffix != '.hdf5':
raise ValueError(f"New file '{new_file}' is not an HDF5 file.")
# check that the path to the new file exists, and if not, make it
if not new_file.parent.is_dir():
try:
new_file.parent.mkdir(parents=True)
except Exception as e:
raise OSError(f"Could not create directory for new file '{new_file}': {e}")
failed_mols = []
db = DataBase()
db.add_data(original_file)
#original_filename = os.path.basename(original_file)
fstring = original_file.stem
with h5py.File(original_file, 'r') as orig_db, h5py.File(new_file, 'w') as new_db:
for mol_name in orig_db.keys():
# Check if molecule exists in the original database
if mol_name not in db.data.get(fstring, {}):
print(f"Molecule '{mol_name}' not found in original db data")
continue
# Copy molecule data to the new file
orig_db.copy(mol_name, new_db)
# Try configs of the molecule until a smiles is generated.
found = False
if found:
break
save_molecule_to_xyz(db, fstring, mol_name, config=0)
# Generate SMILES
try:
smiles, charge = assign_smiles(mol_name, use_suggested_charge=use_suggested_charge, maxcount=maxcount)
found = True
except ValueError as e:
print(f"{mol_name} failed due to an error in SMILES generation: {e}, moving to next config.")
found = False
continue
if not found or smiles is None:
print("Not found, trying openbabel...")
try:
print("Trying openbabel...")
from openbabel import openbabel
obConversion = openbabel.OBConversion()
obConversion.SetInAndOutFormats("xyz", "smi")
mol = openbabel.OBMol()
obConversion.ReadFile(mol, "molecule_structure.xyz")
smiles = str(obConversion.WriteString(mol))
charge = mol.GetTotalCharge()
except Exception as e:
print(f"Failed to generate SMILES for molecule '{mol_name}' in {fstring}: {e}")
shutil.copyfile("molecule_structure.xyz", f"{mol_name}_failed.xyz") # Copy the original XYZ file back
failed_mols.append(mol_name)
continue
if smiles is not None:
smiles = smiles.strip().split("\t")[0] # Clean up the SMILES string
print("SMILES generated:", smiles)
# Add SMILES notation to the new HDF5 file if successful
mol_group = new_db[mol_name]
if smiles is not None:
if "smiles" in mol_group:
print(f"Deleted existing SMILES notation for '{mol_name}' to add a new one")
del mol_group["smiles"]
mol_group.create_dataset("smiles", data=smiles.encode("utf-8"))
mol_group.create_dataset("charge", data=charge)
else:
print(f"Failed to generate SMILES for '{mol_name}', skipping SMILES addition.")
print(f"Modified HDF5 file with SMILES notation created: '{new_file}'")
if failed_mols:
print("Failed to process the following molecules:", failed_mols)
if __name__ == "main":
hdf5_files = ['ani.hdf5', 'comp6.hdf5', 'freesolvmd.hdf5', 'geom.hdf5', 'spice.hdf5', 're.hdf5', 'remd.hdf5' ]
process_multiple_hdf5_files(hdf5_files)