Source code for pharmaforge.labeling.smiles

from rdkit import Chem
from rdkit.Chem import rdDetermineBonds
import numpy as np
import h5py

[docs] def save_molecule_to_xyz(db, db_name, mol): """ Saves the molecular structure to an XYZ file. Parameters ---------- db : DataBase The database instance containing molecule data. db_name : str The name of the database to connect to. mol : str The name of the molecule to save. """ # fetching molecule information db_instance = db.data[db_name] a = db_instance[mol]['set.000']['coord.npy'][:] type_raw = db.data[db_name][mol]['type.raw'][:] type_map = db.data[db_name][mol]['type_map.raw'][:] coords = np.reshape(a[0], (-1, 3)) num_atoms = len(type_raw) molecule_name = mol with open('molecule_structure.xyz', 'w') as file: # Write the first two lines file.write(f"{num_atoms}\n") file.write(f"Molecule: {molecule_name}\n") for i, atom_type_index in enumerate(type_raw): atom_symbol = type_map[atom_type_index].decode("utf-8") # Decode bytes to string atom_coords = coords[i] file.write(f"{atom_symbol} {atom_coords[0]:.5f} {atom_coords[1]:.5f} {atom_coords[2]:.5f}\n") print("Molecular structure saved to molecule_structure.xyz")
[docs] def assign_smiles(molecule_name, failed_mols): """ Assigns SMILES notation to a molecule using RDKit. Parameters ---------- molecule_name : str The name of the molecule for which to generate SMILES. failed_mols : list A list to keep track of failed molecules. Returns ------- smiles : str The SMILES notation for the molecule, or None if generation failed. """ raw_mol = Chem.rdmolfiles.MolFromXYZFile("molecule_structure.xyz") if raw_mol is None: print(f"Failed to load molecule '{molecule_name}' from XYZ file.") failed_mols.append(molecule_name) return None mol = Chem.Mol(raw_mol) try: rdDetermineBonds.DetermineBonds(mol, charge=0) smiles = Chem.MolToSmiles(mol) print(f"SMILES notation for {molecule_name} is {smiles}") return smiles except Exception as e: print(f"Error determining bonds for molecule '{molecule_name}': {e}") failed_mols.append(molecule_name) return None
[docs] def new_hdf5_file_with_smiles(original_file: str, new_file: str, db) -> None: """Creates a new hdf5 file with smiles strings Parameters ---------- original_file : str The name of the original HDF5 file to read from. new_file : str The name of the new HDF5 file to create. db : DataBase The database instance containing molecule data. """ failed_mols = [] with h5py.File(f"{original_file}.hdf5", 'r') as orig_db, h5py.File(f"{new_file}.hdf5", 'w') as new_db: for mol_name in orig_db.keys(): # Check if molecule exists in the original database if mol_name not in db.data.get(original_file, {}): print(f"Molecule '{mol_name}' not found in original db data") continue # Copy molecule data to the new file orig_db.copy(mol_name, new_db) save_molecule_to_xyz(db, original_file, mol_name) # Generate SMILES notation for the molecule try: smiles = assign_smiles(mol_name, failed_mols) except ValueError: print(f"Skipping molecule '{mol_name}' due to an error in SMILES generation.") failed_mols.append(mol_name) continue # Add SMILES notation to the new HDF5 file if successful mol_group = new_db[mol_name] if smiles is not None: if "smiles" in mol_group: print(f"Deleted existing SMILES notation for '{mol_name}' to add a new one") del mol_group["smiles"] mol_group.create_dataset("smiles", data=smiles.encode("utf-8")) else: print(f"Failed to generate SMILES for '{mol_name}', skipping SMILES addition.") print(f"Modified HDF5 file with SMILES notation created: '{new_file}.hdf5'") if failed_mols: print("Failed to process the following molecules:", failed_mols)
[docs] def count_molecules(smiles): """ Counts the number of molecules in a SMILES string. Parameters ---------- smiles : str The SMILES notation for the molecule. """ if not smiles: print("Smiles notation for this molecule is not provided") return None molecules = smiles.split('.') return len(molecules)
[docs] def count_water_molecules(smiles, mol_smile = "[H]O[H]"): """ Counts the number of water molecules in a SMILES string. Parameters ---------- smiles : str The SMILES notation for the molecule. mol_smile : str The SMILES notation for the molecule to count (default is water). """ if not smiles: print("Smiles notation for this molecule is not provided") return None molecules = smiles.split('.') water_count = molecules.count(mol_smile) return water_count