Source code for pharmaforge.labeling.smiles

from rdkit import Chem
from rdkit.Chem import rdDetermineBonds
import numpy as np
import h5py


[docs]
def save_molecule_to_xyz(db, db_name, mol):
    """ Saves the molecular structure to an XYZ file. 
    
    Parameters
    ----------
    db : DataBase
        The database instance containing molecule data.
    db_name : str
        The name of the database to connect to.
    mol : str
        The name of the molecule to save.
    
    """
    # fetching molecule information
    db_instance = db.data[db_name]
    a = db_instance[mol]['set.000']['coord.npy'][:]
    type_raw = db.data[db_name][mol]['type.raw'][:]
    type_map = db.data[db_name][mol]['type_map.raw'][:]
    coords = np.reshape(a[0], (-1, 3))
    num_atoms = len(type_raw)
    molecule_name = mol

    with open('molecule_structure.xyz', 'w') as file:
        # Write the first two lines
        file.write(f"{num_atoms}\n")
        file.write(f"Molecule: {molecule_name}\n")
        for i, atom_type_index in enumerate(type_raw):
            atom_symbol = type_map[atom_type_index].decode("utf-8")  # Decode bytes to string
            atom_coords = coords[i]

            file.write(f"{atom_symbol}   {atom_coords[0]:.5f} {atom_coords[1]:.5f} {atom_coords[2]:.5f}\n")

    print("Molecular structure saved to molecule_structure.xyz")



[docs]
def assign_smiles(molecule_name, failed_mols):
    """ Assigns SMILES notation to a molecule using RDKit.
    
    Parameters
    ----------
    molecule_name : str
        The name of the molecule for which to generate SMILES.
    failed_mols : list
        A list to keep track of failed molecules.

    Returns
    -------
    smiles : str
        The SMILES notation for the molecule, or None if generation failed.
    
    """
    raw_mol = Chem.rdmolfiles.MolFromXYZFile("molecule_structure.xyz")
    
    if raw_mol is None:
        print(f"Failed to load molecule '{molecule_name}' from XYZ file.")
        failed_mols.append(molecule_name)
        return None

    mol = Chem.Mol(raw_mol)
    try:
        rdDetermineBonds.DetermineBonds(mol, charge=0)
        smiles = Chem.MolToSmiles(mol)
        print(f"SMILES notation for {molecule_name} is {smiles}")
        return smiles
    except Exception as e:
        print(f"Error determining bonds for molecule '{molecule_name}': {e}")
        failed_mols.append(molecule_name)
        return None

    

    

[docs]
def new_hdf5_file_with_smiles(original_file: str, new_file: str, db) -> None:
    """Creates a new hdf5 file with smiles strings
    
    Parameters
    ----------
    original_file : str
        The name of the original HDF5 file to read from.
    new_file : str
        The name of the new HDF5 file to create.
    db : DataBase
        The database instance containing molecule data.
    
    """
    failed_mols = []

    with h5py.File(f"{original_file}.hdf5", 'r') as orig_db, h5py.File(f"{new_file}.hdf5", 'w') as new_db:
        for mol_name in orig_db.keys():
            # Check if molecule exists in the original database
            if mol_name not in db.data.get(original_file, {}):
                print(f"Molecule '{mol_name}' not found in original db data")
                continue
            
            # Copy molecule data to the new file
            orig_db.copy(mol_name, new_db)
            save_molecule_to_xyz(db, original_file, mol_name)
            
            # Generate SMILES notation for the molecule
            try:
                smiles = assign_smiles(mol_name, failed_mols)
            except ValueError:
                print(f"Skipping molecule '{mol_name}' due to an error in SMILES generation.")
                failed_mols.append(mol_name)
                continue
            
            # Add SMILES notation to the new HDF5 file if successful
            mol_group = new_db[mol_name]
            if smiles is not None:
                if "smiles" in mol_group:
                    print(f"Deleted existing SMILES notation for '{mol_name}' to add a new one")
                    del mol_group["smiles"]
                mol_group.create_dataset("smiles", data=smiles.encode("utf-8"))
            else:
                print(f"Failed to generate SMILES for '{mol_name}', skipping SMILES addition.")

    print(f"Modified HDF5 file with SMILES notation created: '{new_file}.hdf5'")
    if failed_mols:
        print("Failed to process the following molecules:", failed_mols)




[docs]
def count_molecules(smiles):
    """ Counts the number of molecules in a SMILES string.
    
    Parameters
    ----------
    smiles : str
        The SMILES notation for the molecule.
    
    """
    if not smiles:
        print("Smiles notation for this molecule is not provided")
        return None
    molecules = smiles.split('.')
    return len(molecules)



[docs]
def count_water_molecules(smiles, mol_smile = "[H]O[H]"):
    """ Counts the number of water molecules in a SMILES string.
    
    Parameters
    ----------
    smiles : str
        The SMILES notation for the molecule.
    mol_smile : str
        The SMILES notation for the molecule to count (default is water).

    """
    if not smiles:
        print("Smiles notation for this molecule is not provided")
        return None
    
    molecules = smiles.split('.')
    water_count = molecules.count(mol_smile)
    return water_count