commit 52145ecabfdec9d5e3165aa985c551085fec017c Author: rnsrk Date: Wed Feb 7 15:33:41 2024 +0100 first commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8707e41 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +kram +docs +schemas +test-docs +test-schemas +database.db +test.db +venv +.idea +__pycache__ \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..07455bb --- /dev/null +++ b/README.md @@ -0,0 +1,35 @@ +# Good Bye HIDA +Small script to transform XML documents of the HIDA/MIDAS architecture into a SQLite database. + +## Prerequisites +Create a virtual environment: +```bash +python3 -m venv venv +``` +Activate the virtual environment: +```bash +source venv/bin/activate +``` +Install the requirements: +```bash +pip install -r requirements.txt +``` +Place the XML files in the `docs` folder or, for evaluation purposes, a few files in the `test-docs` folder. + +## Usage +For a test run, place XML files in a directory named `test-docs`, then type +```bash +python3 goodByeHida.py --buildSchemas True +``` +You will get a directory `test-schemas` and a SQLite database `test.db` with the imported data. + +If everything looks good, you can run the script with the `docs` folder: +```bash +python3 goodByeHida.py --production True --buildSchemas True +``` +You will get a directory `schemas` and a SQLite database `database.db` with the imported data. 
+ +If you would like to restart the process and delete the database, type: +```bash +python3 goodByeHida.py --production True --buildSchemas True --dropDb True +``` diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..9c2cbc8 --- /dev/null +++ b/__init__.py @@ -0,0 +1,5 @@ +"""Top-Level package for the project.""" +# __init__.py + +__app_name__ = "rptodo" +__version__ = "0.1.0" \ No newline at end of file diff --git a/buildSchemas.py b/buildSchemas.py new file mode 100644 index 0000000..36ea7de --- /dev/null +++ b/buildSchemas.py @@ -0,0 +1,170 @@ +import os +import json +from utils import cleanEntityName +import xml.etree.ElementTree as ET +import shutil +from tqdm import tqdm + + +def processNode(node, schemaDir: str, parentName: str = None) -> set: + """ Process a node. + + Args: + node (Element): The node to process. + schemaDir (str): The path to the directory to store the schemas. + parentName (str, optional): The name of the parent node. Defaults to None. + + Returns: + set: The set of columns. + """ + + if node.tag == "block": + # If the node is a block, it is the root node. + key_lbl: str = cleanEntityName(f"{node.get('txt')}") # The name of the column. + columns: set = set([]) # The set of columns. + else: + # If the node is not a block, it is a child node. + key_lbl: str = cleanEntityName(f"{node.get('key')}_{node.get('lbl')}") # The name of the column. + columns: set = {f"f__{key_lbl}"} # The set of columns with its own name, cause it has children. + for child in node: + if len(child) > 0: + # If the child node has children, process the child node. + processNode(child, schemaDir, key_lbl) # The columns of the child node. + elif 'txt' in child.attrib: + # If the child node has a text attribute, we need no column. + createRelTable(schemaDir, key_lbl, cleanEntityName(f"{child.get('key')}_{child.get('lbl')}")) + childName: str = cleanEntityName(f"{child.get('key')}_{child.get('lbl')}") # The name of the child column. 
+ + childColumns = set([f"f__uuid", f"f__{childName}"]) + filePathEntity: str = os.path.join(schemaDir, f"c__{childName}.json") + if os.path.exists(filePathEntity): + # If the entity file exists, load the existing columns. + with open(filePathEntity, 'r', encoding='utf-8') as f: + # Load the existing columns from the entity file. + existingChildColumns: list = json.load(f).get("columns", []) + childColumns.update(existingChildColumns) + with open(filePathEntity, 'w', encoding='utf-8') as f: + # Open the entity file to write. + + # Write the entity file with the columns. + json.dump({"name": f"c__{childName}", "columns": list(childColumns)}, f, ensure_ascii=False) + else: + # Iterate through the children of the node. + childName: str = cleanEntityName(f"{child.get('key')}_{child.get('lbl')}") # The name of the child column. + # Add the child column to the set of columns. + columns.add(f"f__{childName}") + + if columns and len(node) > 0: + # Check if the node has children + + columnsList: list = sorted(list(columns)) # Sorted list of the columns. + + # Add the uuid column to the list of columns + columnsList.append("f__uuid") + filePathEntity: str = os.path.join(schemaDir, f"c__{key_lbl}.json") # The path to the entity file. + if os.path.exists(filePathEntity): + # If the entity file exists, load the existing columns. + with open(filePathEntity, 'r', encoding='utf-8') as f: + # Load the existing columns from the entity file. + existingColumns: list = json.load(f).get("columns", []) # The existing columns. + + # Add the existing columns to the list of columns. + columnsList.extend(existingColumns) + + # Remove duplicates + columnsList = sorted(list(set(columnsList))) + + with open(filePathEntity, 'w', encoding='utf-8') as f: + # Open the entity file to write. + + # Write the entity file with the columns. + json.dump({"name": f"c__{key_lbl}", "columns": columnsList}, f, ensure_ascii=False) + if parentName: + # If the node has a parent, create a relationship table. 
+ createRelTable(schemaDir, parentName, key_lbl) + return columns + + +def processXmlFile(filePath, schemaDir): + """Process an XML file. + + Args: + filePath (str): The path to the XML file. + schemaDir (str): The path to the directory to store the schemas. + """ + tree = ET.parse(filePath) # The XML tree. + root = tree.getroot() # The root of the XML tree. + + os.makedirs(schemaDir, exist_ok=True) + + for block in root.iter('block'): + # Iterate through the blocks in the XML file and process each block. + if 'txt' in block.attrib: + # If the block has a text attribute, process the block. + columns: set = processNode(block, schemaDir) # The columns of the block. + columnsList: list = sorted(list(columns)) # Sorted list of the columns. + filePath: str = os.path.join(schemaDir, f"c__{block.get('txt')}.json") # The path to the file. + + if os.path.exists(filePath): + # If the file exists, load the existing columns. + with open(filePath, 'r', encoding='utf-8') as f: + # Load the existing columns from the file. + existingColumns: list = json.load(f).get("columns", []) # The existing columns. + + # Add the existing columns to the list of columns. + columnsList.extend(existingColumns) + + # Remove duplicates from the list of columns. + columnsList = sorted(list(set(columnsList))) + + with open(filePath, 'w', encoding='utf-8') as f: + # Open the file to write. + + # Write the file with the columns. + json.dump({"name": f"c__{block.get('txt')}", "columns": columnsList}, f, ensure_ascii=False) + + +def buildSchemas(dirPath, schemaDir): + """Parse schemas from XML files and saves them as json. + + Args: + dirPath (str): The path to the directory containing the XML files. + schemaDir (str): The path to the directory to store the schemas. 
+ """ + if os.path.exists(schemaDir): + # Remove the existing schema directory + shutil.rmtree(schemaDir) + + # Get the total number of XML files + totalFiles = sum([len([f for f in files if f.endswith('.xml')]) for r, d, files in os.walk(dirPath)]) + + with tqdm(total=totalFiles, desc="Processing XML files", ncols=75) as pbar: + for dirpath, dirnames, filenames in os.walk(dirPath): + # Walk through the directory and process each XML file + for fileName in filenames: + if fileName.endswith('.xml'): + processXmlFile(os.path.join(dirpath, fileName), schemaDir) + # Update the progress bar + pbar.update(1) + print('Schemas built.') + + +def createRelTable(schemaDir: str, parentName: str, key_lbl: str): + """Create a relationship table. + Args: + schemaDir (str): The path to the directory to store the schemas. + parentName (str): The name of the parent node. + key_lbl (str): The name of the column. + """ + tableName = f"r__{parentName}__{key_lbl}" + filePathRelTable: str = os.path.join(schemaDir, + f"{tableName}.json" + ) # The path to the relationship table file. + + with open(filePathRelTable, 'w', encoding='utf-8') as f: + # Open the relationship table file to write. + + # Write the relationship table file with the columns. 
+ json.dump( + {"name": tableName, "columns": [f"f__{parentName}__uuid", f"f__{key_lbl}__uuid"]}, + f, ensure_ascii=False) diff --git a/goodByeHida.py b/goodByeHida.py new file mode 100644 index 0000000..2461a46 --- /dev/null +++ b/goodByeHida.py @@ -0,0 +1,64 @@ +import argparse +from buildSchemas import buildSchemas +from distutils.util import strtobool +from importer import Importer +from initDb import initDb +import os + +# Create the parser +parser = argparse.ArgumentParser(description="Run the program with specific configurations.") + +# Add the arguments +parser.add_argument('--production', type=str, default='False', help='Set to True if you want to parse the docs folder, else if parse test-docs') +parser.add_argument('--buildSchemas', type=str, default='False', help='Set to True to rebuild the JSONs for the database schemas') +parser.add_argument('--dropDb', type=str, default='False', help='Set to True to drop the database to restart from scratch') + +# Parse the arguments +args = parser.parse_args() + +_production = bool(strtobool(args.production)) +_buildSchemas = bool(strtobool(args.buildSchemas)) +_dropDb = bool(strtobool(args.dropDb)) + +if _production: + print('Running in production mode.') + docsDir: str = './docs/' # The directory containing the XML files. + schemaDir: str = './schemas/' # The directory to store the schemas. 
+else: + print('Running in test mode.') + docsDir = './test-docs/' + schemaDir = './test-schemas/' + +if _buildSchemas: + print('Creating the schema jsons...') + buildSchemas(docsDir, schemaDir) + +if _dropDb: + # Renew the database + print('Remove the database...') + if _production: + dbName = 'database.db' + else: + dbName = 'test.db' + if os.path.exists(dbName): + os.remove(dbName) + print('Database removed.') + else: + print('Database does not exist.') + + +# Initialize the database +print('Initializing the database...') +engine, metadata = initDb(_production, schemaDir) +if engine == False: + print('Database initialization failed.') + exit() + + +# Import the data +print('Importing the data...') +importer = Importer(engine, metadata, docsDir) +importer.importData() + +print('Finished.') + diff --git a/importer.py b/importer.py new file mode 100644 index 0000000..26362d8 --- /dev/null +++ b/importer.py @@ -0,0 +1,147 @@ +import os +import xml.etree.ElementTree as ET +import pandas as pd +import uuid +from utils import cleanEntityName, tableExists +from sqlalchemy.orm import Session +from tqdm import tqdm + + +def insertData2Db(engine: Session, tableName: str, columns: dict): + """Inserts data into a database table. + + Args: + engine (): The database engine to use. + tableName (str): The name of the table to insert the data into. + columns (dict): A list of dictionaries containing the data to insert. + """ + + if not tableExists(engine, tableName): + # If the table does not exist, print an error message and return. + print(f'Table {tableName} does not exist.') + return + + # Create a dataframe from the columns. + df = pd.DataFrame([columns]) # The dataframe to insert. + + # Insert the dataframe into the database. 
+ df.to_sql(tableName, engine, if_exists='append', index=False) + + +class Importer: + def __init__(self, engine: Session, metadata: Session, docsDir: str): + self.engine = engine + self.metadata = metadata + self.docsDir = docsDir + + def importNode(self, node: ET.Element, parentUuid: str = None, parentKey: str = None): + """Imports a node from an XML file into the database. + + Args: + node (ET.Element): The node to import. + parentUuid (str, optional): The UUID of the parent node. Defaults to None. + parentKey (str, optional): The key of the parent node. Defaults to None. + + Returns: + Dict[Dict]: The data from the node. + """ + + data: dict[dict] = {'f__uuid': parentUuid} if parentUuid else {} # The data table to import + + # Iterate through the children of the node. + for child in node: + # Iterate through the children of the node. + + classKey: str = f"{child.get('key')}_{cleanEntityName(child.get('lbl'))}" # The key for the class + className: str = f"c__{classKey}" # The name of the class + fieldKey: str = f"{classKey}" # The key for the field + fieldName: str = f"f__{fieldKey}" # The name of the field + entityUuid: str = str(uuid.uuid4()) # The UUID for the entity + + childData: dict[str, str] = { + "f__uuid": entityUuid, + } # The data table (with uuid) for the child node + + if 'txt' in child.attrib: + # If the child node has a text attribute, it is an entity. + childData.update({fieldName: child.get('txt')}) + + if len(child) > 0: + # If the child node has children, import the data of the child node and its children. + + # Recursively import the data of the child node. + childData.update(self.importNode(child)) + + # Insert the data of the child node into the database. + insertData2Db(self.engine, className, childData) + + # Insert the relationship data into the database. + self.insertRelData(parentUuid, parentKey, entityUuid, classKey) + + else: + # If the child node has no children, import the data of the child node. 
+ + key: str = f"f__{child.get('key')}_{cleanEntityName(child.get('lbl'))}" # The key for the row + + if child.text is not None: + row: dict = {key: child.text.replace('###{new_line}### ', '\n')} # The row to insert + + data.update(row) + return data + + def processXmlFile(self, filePath: str, fileName: str): + """Processes an XML file and imports the data into the database. + + Args: + filePath (str): The path to the XML file. + fileName (str): The name of the XML file. + """ + + tree = ET.parse(filePath) # The XML tree. + root = tree.getroot() # The root of the XML tree. + + for block in root.iter('block'): + # Iterate through the blocks in the XML file and import the data of each block. + if 'txt' in block.attrib: + # If the block has a 'txt' attribute, import the data of the block. + classKey: str = f"{block.get('txt')}" # The key for the class + blockUuid: str = str(uuid.uuid4()) # The UUID for the block + data: dict[dict] = self.importNode(block, blockUuid, classKey) # The data to import. + tableName: str = f"c__{cleanEntityName(block.get('txt'))}" # The name of the table to import the data into. + try: + insertData2Db(self.engine, tableName, data) + except Exception as e: + print(f"An error occurred while inserting data into {tableName}: {e}") + + def importData(self): + """Imports all XML files in a directory into the database. + Walks through the directory and processes each XML file. 
+ """ + + # Get the total number of XML files + totalFiles = sum([len([f for f in files if f.endswith('.xml')]) for r, d, files in os.walk(self.docsDir)]) # Create a progress bar + with tqdm(total=totalFiles, desc="Processing XML files", ncols=75) as pbar: + for dirpath, dirnames, filenames in os.walk(self.docsDir): + # Walk through the directory and process each XML file + for fileName in filenames: + if fileName.endswith('.xml'): + self.processXmlFile(os.path.join(dirpath, fileName), fileName) + # Update the progress bar + pbar.update(1) + print('Data imported.') + + def insertRelData(self, parentUuid: str, parentKey: str, entityUuid: str, classKey: str): + """Imports the relationship data into the database. + Args: + parentUuid (str): The UUID of the parent entity. + parentKey (str): The key of the parent entity. + entityUuid (str): The UUID of the entity. + classKey (str): The key of the entity. + """ + + relationTableName: str = f"r__{parentKey}__{classKey}" # The name of the relation table + relRow = {f"f__{parentKey}__uuid": parentUuid, + f"f__{classKey}__uuid": entityUuid} # The row to insert into the relation table + relDf = pd.DataFrame([relRow]) # The dataframe to insert into the relation table + relDf.to_sql(relationTableName, self.engine, if_exists='append', + index=False) # Insert the dataframe into the relation table \ No newline at end of file diff --git a/initDb.py b/initDb.py new file mode 100644 index 0000000..1a73665 --- /dev/null +++ b/initDb.py @@ -0,0 +1,36 @@ +import os +from sqlalchemy import create_engine, MetaData +from initSchemas import initClassesFromSchemas, Base + + + +# Database Initialization +def initDb(_production, schemaDir): + """Initialize the database. + """ + + # Initialize the classes from the schemas + print('Initializing the classes from the schemas...') + if not initClassesFromSchemas(schemaDir): + print('Cannot initialize database. 
No schemas found.') + return (False, False) + + if _production: + dbName = 'database.db' + else: + dbName = 'test.db' + + # Get the directory of the script + dirPath = os.path.dirname(os.path.realpath(__file__)) + + # Create the path of the database file + dbPath = os.path.join(dirPath, dbName) + + engine = create_engine(f'sqlite:///{dbPath}') + metadata = MetaData() + + # Create all tables in the engine + Base.metadata.create_all(engine) + + print('Database initialized.') + return engine, metadata diff --git a/initSchemas.py b/initSchemas.py new file mode 100644 index 0000000..5796c5d --- /dev/null +++ b/initSchemas.py @@ -0,0 +1,59 @@ +import json +import os +from sqlalchemy import Column, Integer, String, Table +from sqlalchemy.ext.declarative import declarative_base + +Base = declarative_base() + +def createClass(name, columns): + """Create a SQLAlchemy class from a JSON schema. + + Args: + name (str): The name of the class. + columns (list): The columns of the class. + + Returns: + SQLAlchemy.Class: The SQLAlchemy class. 
+ """ + # Transform name and add prefix + className = name.lower().replace('-', '_').replace('.', '_').replace(' ', '_').replace('(', '_').replace(')', '_').replace('ä', 'ae').replace('ö', 'oe').replace('ü', 'ue').replace('ß', 'ss').replace('?', '_') + tableName = name.lower().replace('-', '_').replace('.', '_').replace(' ', '_').replace('(', '_').replace(')', '_').replace('ä', 'ae').replace('ö', 'oe').replace('ü', 'ue').replace('ß', 'ss').replace('?', '_') + + # Transform columns and add prefix + attrs = {'__tablename__': tableName} + attrs.update({prop.lower().replace('-', '_').replace('.', '_').replace(' ', '_').replace('(', '_').replace(')','_').replace('ä', 'ae').replace('ö', 'oe').replace('ü', 'ue').replace('ß', 'ss').replace('?', '_'): (Column(String, primary_key=True) if prop.lower() == 'uuid' else Column(String)) for prop in columns}) + + # If 'uuid' is not in columns, add 'id' as primary key + if 'uuid' not in [prop.lower() for prop in columns]: + attrs['id'] = Column(Integer, primary_key=True) + + # Create SQLAlchemy class + cls = type(className, (Base,), attrs) + + # Define the table with extend_existing=True + Table(tableName, Base.metadata, extend_existing=True) + + return cls + +def initClassesFromSchemas(schemaDir): + """Initialize the classes from the schemas. 
+ """ + + if not os.path.exists(schemaDir): + print('Schema directory does not exist.') + return False + + schemaList = os.listdir(schemaDir) + + if not schemaList: + print('No schemas JSON\'s found.') + return False + + for fileName in schemaList: + if fileName.endswith('.json'): + with open(os.path.join(schemaDir, fileName), 'r') as f: + data = json.load(f) + cls = createClass(data['name'], data['columns']) + globals()[cls.__name__] = cls # Add the class to the global namespace + print('Classes initialized from schemas.') + return True \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..eb935e4 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +pandas +sqlalchemy +tqdm \ No newline at end of file diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..c8543a4 --- /dev/null +++ b/utils.py @@ -0,0 +1,9 @@ +from sqlalchemy import MetaData, Table + +def cleanEntityName(entityName): + return entityName.lower().replace('-', '_').replace('.', '_').replace(' ', '_').replace('(', '_').replace(')', '_').replace('ä', 'ae').replace('ö', 'oe').replace('ü', 'ue').replace('ß', 'ss').replace('?', '_') + +def tableExists(engine, table_name): + metadata = MetaData() + metadata.reflect(bind=engine) + return table_name in metadata.tables \ No newline at end of file