from rdkit import Chem
from rdkit.Chem import rdDetermineBonds
import numpy as np
import h5py
[docs]
def save_molecule_to_xyz(db, db_name, mol):
""" Saves the molecular structure to an XYZ file.
Parameters
----------
db : DataBase
The database instance containing molecule data.
db_name : str
The name of the database to connect to.
mol : str
The name of the molecule to save.
"""
# fetching molecule information
db_instance = db.data[db_name]
a = db_instance[mol]['set.000']['coord.npy'][:]
type_raw = db.data[db_name][mol]['type.raw'][:]
type_map = db.data[db_name][mol]['type_map.raw'][:]
coords = np.reshape(a[0], (-1, 3))
num_atoms = len(type_raw)
molecule_name = mol
with open('molecule_structure.xyz', 'w') as file:
# Write the first two lines
file.write(f"{num_atoms}\n")
file.write(f"Molecule: {molecule_name}\n")
for i, atom_type_index in enumerate(type_raw):
atom_symbol = type_map[atom_type_index].decode("utf-8") # Decode bytes to string
atom_coords = coords[i]
file.write(f"{atom_symbol} {atom_coords[0]:.5f} {atom_coords[1]:.5f} {atom_coords[2]:.5f}\n")
print("Molecular structure saved to molecule_structure.xyz")
[docs]
def assign_smiles(molecule_name, failed_mols):
""" Assigns SMILES notation to a molecule using RDKit.
Parameters
----------
molecule_name : str
The name of the molecule for which to generate SMILES.
failed_mols : list
A list to keep track of failed molecules.
Returns
-------
smiles : str
The SMILES notation for the molecule, or None if generation failed.
"""
raw_mol = Chem.rdmolfiles.MolFromXYZFile("molecule_structure.xyz")
if raw_mol is None:
print(f"Failed to load molecule '{molecule_name}' from XYZ file.")
failed_mols.append(molecule_name)
return None
mol = Chem.Mol(raw_mol)
try:
rdDetermineBonds.DetermineBonds(mol, charge=0)
smiles = Chem.MolToSmiles(mol)
print(f"SMILES notation for {molecule_name} is {smiles}")
return smiles
except Exception as e:
print(f"Error determining bonds for molecule '{molecule_name}': {e}")
failed_mols.append(molecule_name)
return None
[docs]
def new_hdf5_file_with_smiles(original_file: str, new_file: str, db) -> None:
"""Creates a new hdf5 file with smiles strings
Parameters
----------
original_file : str
The name of the original HDF5 file to read from.
new_file : str
The name of the new HDF5 file to create.
db : DataBase
The database instance containing molecule data.
"""
failed_mols = []
with h5py.File(f"{original_file}.hdf5", 'r') as orig_db, h5py.File(f"{new_file}.hdf5", 'w') as new_db:
for mol_name in orig_db.keys():
# Check if molecule exists in the original database
if mol_name not in db.data.get(original_file, {}):
print(f"Molecule '{mol_name}' not found in original db data")
continue
# Copy molecule data to the new file
orig_db.copy(mol_name, new_db)
save_molecule_to_xyz(db, original_file, mol_name)
# Generate SMILES notation for the molecule
try:
smiles = assign_smiles(mol_name, failed_mols)
except ValueError:
print(f"Skipping molecule '{mol_name}' due to an error in SMILES generation.")
failed_mols.append(mol_name)
continue
# Add SMILES notation to the new HDF5 file if successful
mol_group = new_db[mol_name]
if smiles is not None:
if "smiles" in mol_group:
print(f"Deleted existing SMILES notation for '{mol_name}' to add a new one")
del mol_group["smiles"]
mol_group.create_dataset("smiles", data=smiles.encode("utf-8"))
else:
print(f"Failed to generate SMILES for '{mol_name}', skipping SMILES addition.")
print(f"Modified HDF5 file with SMILES notation created: '{new_file}.hdf5'")
if failed_mols:
print("Failed to process the following molecules:", failed_mols)
[docs]
def count_molecules(smiles):
""" Counts the number of molecules in a SMILES string.
Parameters
----------
smiles : str
The SMILES notation for the molecule.
"""
if not smiles:
print("Smiles notation for this molecule is not provided")
return None
molecules = smiles.split('.')
return len(molecules)
[docs]
def count_water_molecules(smiles, mol_smile = "[H]O[H]"):
""" Counts the number of water molecules in a SMILES string.
Parameters
----------
smiles : str
The SMILES notation for the molecule.
mol_smile : str
The SMILES notation for the molecule to count (default is water).
"""
if not smiles:
print("Smiles notation for this molecule is not provided")
return None
molecules = smiles.split('.')
water_count = molecules.count(mol_smile)
return water_count