import os, sys
import h5py
from typing import List
from pathlib import Path
from pharmaforge.io.hdf5_utils import *
[docs]
def _copy_molecules_with_smiles(source_path: str, target_path: str, dataset_name: str, db) -> List[str]:
    """
    Copy the known molecules of one HDF5 file into a new file and attach SMILES.

    Parameters
    ----------
    source_path : str
        Path of the HDF5 file to read (opened read-only).
    target_path : str
        Path of the HDF5 file to create (opened with mode ``'w'``).
    dataset_name : str
        Key used to look up the molecules of this file in ``db.data``.
    db : DataBase
        The database instance containing molecule data.

    Returns
    -------
    List[str]
        The list that collected failures; molecule names whose SMILES
        generation raised ``ValueError`` are appended to it here.
    """
    failed_mols: List[str] = []
    # Loop-invariant lookup: fetch the molecules known for this file once
    # instead of once per molecule.
    known_molecules = db.data.get(dataset_name, {})

    with h5py.File(source_path, 'r') as orig_db, h5py.File(target_path, 'w') as new_db:
        for mol_name in orig_db.keys():
            # Check if molecule exists in the original database
            if mol_name not in known_molecules:
                print(f"Molecule '{mol_name}' not found in original database data")
                continue

            # Copy molecule data to the new file
            orig_db.copy(mol_name, new_db)
            # NOTE(review): save_molecule_to_xyz and assign_smiles are not defined
            # in this file; presumably they come from the hdf5_utils star import.
            save_molecule_to_xyz(db, dataset_name, mol_name)

            # Generate SMILES notation for the molecule
            try:
                # The failure list is handed to assign_smiles as well; whether
                # it records entries itself is not visible from here.
                smiles = assign_smiles(mol_name, failed_mols)
            except ValueError:
                print(f"Skipping molecule '{mol_name}' due to an error in SMILES generation.")
                failed_mols.append(mol_name)
                # The group was already copied above, so it stays in the new
                # file without a "smiles" dataset.
                continue

            if smiles is None:
                print(f"Failed to generate SMILES for '{mol_name}', skipping SMILES addition.")
                continue

            # Add SMILES notation to the new HDF5 file, replacing a copied one
            mol_group = new_db[mol_name]
            if "smiles" in mol_group:
                print(f"Deleted existing SMILES notation for '{mol_name}' to add a new one")
                del mol_group["smiles"]
            mol_group.create_dataset("smiles", data=smiles.encode("utf-8"))

    return failed_mols


def process_files_with_smiles(input_folder: str, output_folder: str, db) -> None:
    """
    Process all HDF5 files in the input folder, adding SMILES notation to each.

    For every ``<name>.hdf5`` entry of ``input_folder`` a file
    ``<name>_with_smiles.hdf5`` is written to ``output_folder``. Only molecules
    present in ``db.data[<name>]`` are copied. Errors while handling one file
    are printed and the remaining files are still processed.

    Parameters
    ----------
    input_folder : str
        The folder containing the original HDF5 files.
    output_folder : str
        The folder to save the modified HDF5 files with SMILES notation.
        It is created, including parents, when it does not exist.
    db : DataBase
        The database instance containing molecule data. Its ``data`` mapping
        is looked up by the HDF5 file name without extension.
    """
    output_folder = Path(output_folder)
    if not output_folder.exists():
        output_folder.mkdir(parents=True, exist_ok=True)

    # Sorted so that files are processed and reported in a reproducible order;
    # os.listdir returns entries in arbitrary order.
    for file_name in sorted(os.listdir(input_folder)):
        if not file_name.endswith(".hdf5"):
            continue

        original_file = os.path.splitext(file_name)[0]  # Get the file name without extension
        new_file = os.path.join(output_folder, f"{original_file}_with_smiles.hdf5")

        try:
            failed_mols = _copy_molecules_with_smiles(
                os.path.join(input_folder, file_name), new_file, original_file, db
            )
        except FileNotFoundError:
            print(f"File '{file_name}' not found in '{input_folder}'. Skipping.")
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort the batch.
            print(f"An error occurred while processing '{file_name}': {e}")
        else:
            print(f"Modified HDF5 file with SMILES notation created: '{new_file}'")
            if failed_mols:
                print("Failed to process the following molecules:", failed_mols)
if __name__ == "__main__":
    import argparse

    from pharmaforge.database import DataBase

    # Command-line interface: two positional folder arguments.
    cli = argparse.ArgumentParser(description="Process HDF5 files and add SMILES notation.")
    cli.add_argument("input_folder", type=str, help="Folder containing the original HDF5 files.")
    cli.add_argument("output_folder", type=str, help="Folder to save the modified HDF5 files with SMILES notation.")
    options = cli.parse_args()

    # Build the database from every .hdf5 file found in the input folder.
    db = DataBase()
    hdf5_names = [entry for entry in os.listdir(options.input_folder) if entry.endswith(".hdf5")]
    for entry in hdf5_names:
        db.add_data(os.path.join(options.input_folder, entry))

    # Write the "<name>_with_smiles.hdf5" copies into the output folder.
    process_files_with_smiles(options.input_folder, options.output_folder, db)