Source code for pharmaforge.labeling.get_smiles

import os, sys
import h5py
from typing import List
from pathlib import Path


from pharmaforge.io.hdf5_utils import *

def _add_smiles_to_file(source_path: str, target_path: str, dataset_name: str, db) -> List[str]:
    """
    Copy the molecules of one HDF5 file into a new file, attaching SMILES.

    Parameters
    ----------
    source_path : str
        Path of the HDF5 file to read.
    target_path : str
        Path of the HDF5 file to create (opened in ``'w'`` mode, so an
        existing file at this path is truncated).
    dataset_name : str
        Key under which this file's molecules are looked up in ``db.data``.
    db : DataBase
        The database instance containing molecule data.

    Returns
    -------
    List[str]
        The list handed to ``assign_smiles`` plus the names of molecules for
        which ``assign_smiles`` raised ``ValueError``.
    """
    failed_mols: List[str] = []
    with h5py.File(source_path, 'r') as orig_db, h5py.File(target_path, 'w') as new_db:
        for mol_name in orig_db:
            # Only molecules the database knows about for this file are kept.
            if mol_name not in db.data.get(dataset_name, {}):
                print(f"Molecule '{mol_name}' not found in original database data")
                continue

            # The molecule is copied before SMILES generation is attempted, so
            # a failure below still leaves its data in the output file, just
            # without a "smiles" dataset.
            orig_db.copy(mol_name, new_db)

            # NOTE(review): save_molecule_to_xyz / assign_smiles are expected
            # to come from the wildcard import of pharmaforge.io.hdf5_utils;
            # presumably the first writes the file the second reads - confirm.
            save_molecule_to_xyz(db, dataset_name, mol_name)
            try:
                smiles = assign_smiles(mol_name, failed_mols)
            except ValueError:
                print(f"Skipping molecule '{mol_name}' due to an error in SMILES generation.")
                failed_mols.append(mol_name)
                continue

            if smiles is None:
                print(f"Failed to generate SMILES for '{mol_name}', skipping SMILES addition.")
                continue

            mol_group = new_db[mol_name]
            if "smiles" in mol_group:
                # The copied group already carried a SMILES entry; replace it.
                print(f"Deleted existing SMILES notation for '{mol_name}' to add a new one")
                del mol_group["smiles"]
            mol_group.create_dataset("smiles", data=smiles.encode("utf-8"))
    return failed_mols


def process_files_with_smiles(input_folder: str, output_folder: str, db) -> None:
    """
    Process all HDF5 files in the input folder, adding SMILES notation to each.

    Every ``<name>.hdf5`` entry of ``input_folder`` is written to
    ``<output_folder>/<name>_with_smiles.hdf5``. Files are handled in sorted
    name order so that the printed log is reproducible. An error while
    processing one file is printed and the remaining files are still
    processed.

    Parameters
    ----------
    input_folder : str
        The folder containing the original HDF5 files.
    output_folder : str
        The folder to save the modified HDF5 files with SMILES notation.
        It is created (including parents) when missing.
    db : DataBase
        The database instance containing molecule data.

    Raises
    ------
    FileExistsError
        If ``output_folder`` exists but is not a directory.
    OSError
        If ``input_folder`` cannot be listed.
    """
    output_folder = Path(output_folder)
    # exist_ok=True already covers the "directory is there" case, so no
    # separate exists() check is needed.
    output_folder.mkdir(parents=True, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for determinism.
    for file_name in sorted(os.listdir(input_folder)):
        if not file_name.endswith(".hdf5"):
            continue

        original_file = os.path.splitext(file_name)[0]  # file name without extension
        new_file = os.path.join(output_folder, f"{original_file}_with_smiles.hdf5")
        try:
            failed_mols = _add_smiles_to_file(
                os.path.join(input_folder, file_name), new_file, original_file, db
            )
        except FileNotFoundError:
            print(f"File '{file_name}' not found in '{input_folder}'. Skipping.")
        except Exception as e:
            # Batch boundary: report the problem and carry on with the next file.
            print(f"An error occurred while processing '{file_name}': {str(e)}")
        else:
            print(f"Modified HDF5 file with SMILES notation created: '{new_file}'")
            if failed_mols:
                print("Failed to process the following molecules:", failed_mols)
if __name__ == "__main__":
    # Command-line entry point: load every HDF5 file of the input folder into
    # a DataBase, then write SMILES-annotated copies to the output folder.
    import argparse

    from pharmaforge.database import DataBase

    cli = argparse.ArgumentParser(
        description="Process HDF5 files and add SMILES notation."
    )
    cli.add_argument(
        "input_folder",
        type=str,
        help="Folder containing the original HDF5 files.",
    )
    cli.add_argument(
        "output_folder",
        type=str,
        help="Folder to save the modified HDF5 files with SMILES notation.",
    )
    options = cli.parse_args()
    source_dir = options.input_folder

    database = DataBase()

    # Hand each ".hdf5" entry of the input folder to the database before any
    # file is processed.
    hdf5_paths = [
        os.path.join(source_dir, entry)
        for entry in os.listdir(source_dir)
        if entry.endswith(".hdf5")
    ]
    for hdf5_path in hdf5_paths:
        database.add_data(hdf5_path)

    process_files_with_smiles(source_dir, options.output_folder, database)