first commit

2024-02-07 15:33:41 +01:00 · 2024-02-07 15:33:41 +01:00 · 52145ecabf
commit 52145ecabf
10 changed files with 538 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,10 @@
+kram
+docs
+schemas
+test-docs
+test-schemas
+database.db
+test.db
+venv
+.idea
+__pycache__
--- a/README.md
+++ b/README.md
@ -0,0 +1,35 @@
+# Good Bye HIDA
+Small script to transform XML Documents of the HIDA/MIDAS architecture to a sqlite database.
+
+## Prerequisites
+create a virtual environment:
+```bash
+python3 -m venv venv
+```
+activate the virtual environment:
+```bash
+source venv/bin/activate
+```
+install requirements:
+```bash
+pip install -r requirements.txt
+```
+Place the XML files in the `docs` folder or, for evaluation purposes, a few files in the `test-docs` folder.
+
+## Usage
+To have a test run, place XML-files in a dir named `test-docs`, then type
+```bash
+python3 goodByeHida.py --buildSchemas True 
+```
+You will get a dir `test-schemas` and a sqlite database `test.db` with the imported data.
+
+If everything looks good you can run the script with the `docs` folder:
+```bash
+python3 goodByeHida.py --production True --buildSchemas True 
+```
+You will get a dir `schemas` and a sqlite database `database.db` with the imported data.
+
+If you would like to restart the process and delete the database, type:
+```bash
+python3 goodByeHida.py --production True --buildSchemas True --dropDb True
+```
--- a/init.py
+++ b/init.py
@ -0,0 +1,5 @@
+"""Top-Level package for the project."""
+# NOTE(review): this comment used to say `__init__.py`, but the file is added as `init.py` — confirm the intended file name.
+
+# NOTE(review): "rptodo" does not match the project title used in the README — presumably a placeholder; confirm.
+__app_name__ = "rptodo"
+__version__ = "0.1.0"
--- a/buildSchemas.py
+++ b/buildSchemas.py
@ -0,0 +1,170 @@
+import os
+import json
+from utils import cleanEntityName
+import xml.etree.ElementTree as ET
+import shutil
+from tqdm import tqdm
+
+
+def processNode(node, schemaDir: str, parentName: str = None) -> set:
+    """Write the schema files for a node and, recursively, for its children.
+
+    Each child of the node is handled in one of three ways:
+
+    * a child that has children of its own is processed recursively; it gets
+      its own entity file and a relationship file to this node;
+    * a childless child with a ``txt`` attribute gets its own entity file
+      (``f__uuid`` plus its own field) and a relationship file to this node;
+    * any other child becomes a plain column of this node.
+
+    The entity file of the node itself is always written (merged with an
+    already existing file), even when no plain column was collected, so the
+    node always ends up with a schema that contains at least ``f__uuid``.
+
+    Args:
+        node (Element): The node to process.
+        schemaDir (str): The path to the directory to store the schemas.
+        parentName (str, optional): The name of the parent node. Defaults to None.
+
+    Returns:
+        set: The columns collected for this node itself (without ``f__uuid``).
+    """
+
+    if node.tag == "block":
+        # A block is named after its 'txt' attribute and has no field of its own.
+        key_lbl: str = cleanEntityName(f"{node.get('txt')}")
+        columns: set = set()
+    else:
+        # Any other node is named "<key>_<lbl>" and carries a field of the same name.
+        key_lbl: str = cleanEntityName(f"{node.get('key')}_{node.get('lbl')}")
+        columns: set = {f"f__{key_lbl}"}
+
+    for child in node:
+        childName: str = cleanEntityName(f"{child.get('key')}_{child.get('lbl')}")
+        if len(child) > 0:
+            # The recursion writes the child's entity file and the relationship file.
+            processNode(child, schemaDir, key_lbl)
+        elif 'txt' in child.attrib:
+            # A childless entity: it lives in its own table, linked by a relationship table.
+            createRelTable(schemaDir, key_lbl, childName)
+            _mergeEntitySchema(schemaDir, childName, {"f__uuid", f"f__{childName}"})
+        else:
+            # A plain value: stored as a column of this node.
+            columns.add(f"f__{childName}")
+
+    # Always write the node's own entity file. The previous version only
+    # prepared the path and the column list under a condition but used them
+    # unconditionally, which raised UnboundLocalError for nodes without plain
+    # columns (or wrote to the file of the last child).
+    _mergeEntitySchema(schemaDir, key_lbl, columns | {"f__uuid"})
+
+    if parentName:
+        # If the node has a parent, create a relationship table.
+        createRelTable(schemaDir, parentName, key_lbl)
+    return columns
+
+
+def _mergeEntitySchema(schemaDir: str, entityName: str, columns: set) -> None:
+    """Write ``c__<entityName>.json``, keeping the columns of an existing file.
+
+    Args:
+        schemaDir (str): The path to the directory to store the schemas.
+        entityName (str): The cleaned entity name (without the ``c__`` prefix).
+        columns (set): The columns to add to the entity.
+    """
+    filePathEntity: str = os.path.join(schemaDir, f"c__{entityName}.json")
+    merged: set = set(columns)
+    if os.path.exists(filePathEntity):
+        # Columns found in earlier documents must survive the rewrite.
+        with open(filePathEntity, 'r', encoding='utf-8') as f:
+            merged.update(json.load(f).get("columns", []))
+    with open(filePathEntity, 'w', encoding='utf-8') as f:
+        # Sorted so the generated schema does not depend on set ordering.
+        json.dump({"name": f"c__{entityName}", "columns": sorted(merged)}, f, ensure_ascii=False)
+
+
+def processXmlFile(filePath, schemaDir):
+    """Process an XML file and write the schema files of all its blocks.
+
+    Every ``block`` element with a ``txt`` attribute is handed to
+    ``processNode``; the columns it returns are then merged into the block's
+    entity file. The entity name is built with ``cleanEntityName`` so that it
+    is the same name ``processNode`` uses for the block, instead of a second,
+    differently spelled schema file for the same table.
+
+    Args:
+        filePath (str): The path to the XML file.
+        schemaDir (str): The path to the directory to store the schemas.
+    """
+    tree = ET.parse(filePath)  # The XML tree.
+    root = tree.getroot()  # The root of the XML tree.
+
+    os.makedirs(schemaDir, exist_ok=True)
+
+    for block in root.iter('block'):
+        if 'txt' not in block.attrib:
+            # Blocks without a 'txt' attribute have no name and are skipped.
+            continue
+
+        columns: set = processNode(block, schemaDir)  # The columns of the block.
+        entityName: str = f"c__{cleanEntityName(block.get('txt'))}"
+        # A separate name keeps the 'filePath' parameter from being overwritten.
+        schemaPath: str = os.path.join(schemaDir, f"{entityName}.json")
+
+        mergedColumns: set = set(columns)
+        if os.path.exists(schemaPath):
+            # Keep the columns that earlier documents (or processNode) already stored.
+            with open(schemaPath, 'r', encoding='utf-8') as f:
+                mergedColumns.update(json.load(f).get("columns", []))
+
+        with open(schemaPath, 'w', encoding='utf-8') as f:
+            json.dump({"name": entityName, "columns": sorted(mergedColumns)}, f, ensure_ascii=False)
+
+
+def buildSchemas(dirPath, schemaDir):
+    """Rebuild the JSON schema files from all XML files below a directory.
+
+    An existing schema directory is removed first, then every ``.xml`` file
+    found while walking ``dirPath`` is passed to ``processXmlFile``.
+
+    Args:
+        dirPath (str): The path to the directory containing the XML files.
+        schemaDir (str): The path to the directory to store the schemas.
+    """
+    # Start from scratch: drop the schemas of a previous run.
+    if os.path.exists(schemaDir):
+        shutil.rmtree(schemaDir)
+
+    # Count the XML files up front so the progress bar knows its total.
+    xmlCount = 0
+    for _, _, names in os.walk(dirPath):
+        xmlCount += len([name for name in names if name.endswith('.xml')])
+
+    with tqdm(total=xmlCount, desc="Processing XML files", ncols=75) as progress:
+        for folder, _, names in os.walk(dirPath):
+            for name in names:
+                if not name.endswith('.xml'):
+                    continue
+                processXmlFile(os.path.join(folder, name), schemaDir)
+                progress.update(1)
+    print('Schemas built.')
+
+
+def createRelTable(schemaDir: str, parentName: str, key_lbl: str):
+    """Write the schema file of the relationship table between two entities.
+
+    The file is named ``r__<parentName>__<key_lbl>.json`` and is overwritten
+    if it already exists.
+
+        Args:
+            schemaDir (str): The path to the directory to store the schemas.
+            parentName (str): The name of the parent node.
+            key_lbl (str): The name of the child node.
+    """
+    tableName = f"r__{parentName}__{key_lbl}"
+
+    # One uuid column per side of the relationship.
+    schema = {
+        "name": tableName,
+        "columns": [f"f__{parentName}__uuid", f"f__{key_lbl}__uuid"],
+    }
+
+    target: str = os.path.join(schemaDir, f"{tableName}.json")
+    with open(target, 'w', encoding='utf-8') as out:
+        out.write(json.dumps(schema, ensure_ascii=False))
--- a/goodByeHida.py
+++ b/goodByeHida.py
@ -0,0 +1,64 @@
+import argparse
+from buildSchemas import buildSchemas
+from importer import Importer
+from initDb import initDb
+import os
+import sys
+
+
+def _strToBool(value: str) -> bool:
+    """Convert a command line truth value to a bool.
+
+    Accepts the same spellings as the former ``distutils.util.strtobool``
+    (y/yes/t/true/on/1 and n/no/f/false/off/0, case-insensitive). distutils
+    was removed from the standard library in Python 3.12, so the conversion
+    is done locally.
+
+    Raises:
+        argparse.ArgumentTypeError: If the value is not a known truth value.
+    """
+    normalized = value.strip().lower()
+    if normalized in ('y', 'yes', 't', 'true', 'on', '1'):
+        return True
+    if normalized in ('n', 'no', 'f', 'false', 'off', '0'):
+        return False
+    raise argparse.ArgumentTypeError(f"invalid truth value {value!r}")
+
+
+# Create the parser
+parser = argparse.ArgumentParser(description="Run the program with specific configurations.")
+
+# Add the arguments
+parser.add_argument('--production', type=_strToBool, default=False, help='Set to True if you want to parse the docs folder, else if parse test-docs')
+parser.add_argument('--buildSchemas', type=_strToBool, default=False, help='Set to True to rebuild the JSONs for the database schemas')
+parser.add_argument('--dropDb', type=_strToBool, default=False, help='Set to True to drop the database to restart from scratch')
+
+# Parse the arguments
+args = parser.parse_args()
+
+_production = args.production
+_buildSchemas = args.buildSchemas
+_dropDb = args.dropDb
+
+if _production:
+    print('Running in production mode.')
+    docsDir: str = './docs/'  # The directory containing the XML files.
+    schemaDir: str = './schemas/'  # The directory to store the schemas.
+else:
+    print('Running in test mode.')
+    docsDir = './test-docs/'
+    schemaDir = './test-schemas/'
+
+if _buildSchemas:
+    print('Creating the schema jsons...')
+    buildSchemas(docsDir, schemaDir)
+
+if _dropDb:
+    # Renew the database
+    print('Remove the database...')
+    if _production:
+        dbName = 'database.db'
+    else:
+        dbName = 'test.db'
+    # initDb creates the database next to the scripts, not in the current
+    # working directory, so the file has to be removed from there as well.
+    dbPath = os.path.join(os.path.dirname(os.path.realpath(__file__)), dbName)
+    if os.path.exists(dbPath):
+        os.remove(dbPath)
+        print('Database removed.')
+    else:
+        print('Database does not exist.')
+
+
+# Initialize the database
+print('Initializing the database...')
+engine, metadata = initDb(_production, schemaDir)
+if engine is False:
+    # initDb signals failure by returning (False, False).
+    print('Database initialization failed.')
+    sys.exit(1)
+
+
+# Import the data
+print('Importing the data...')
+importer = Importer(engine, metadata, docsDir)
+importer.importData()
+
+print('Finished.')
+
--- a/importer.py
+++ b/importer.py
@ -0,0 +1,147 @@
+import os
+import xml.etree.ElementTree as ET
+import pandas as pd
+import uuid
+from utils import cleanEntityName, tableExists
+from sqlalchemy.orm import Session
+from tqdm import tqdm
+
+
+def insertData2Db(engine: Session, tableName: str, columns: dict):
+    """Append a single row to an existing database table.
+
+    If the table does not exist, a message is printed and nothing is inserted.
+
+    Args:
+        engine (): The database engine to use.
+        tableName (str): The name of the table to insert the data into.
+        columns (dict): The row to insert, mapping column names to values.
+    """
+    if tableExists(engine, tableName):
+        # Wrap the row in a one-line dataframe and let pandas append it.
+        row = pd.DataFrame([columns])
+        row.to_sql(tableName, engine, if_exists='append', index=False)
+    else:
+        print(f'Table {tableName} does not exist.')
+
+
+class Importer:
+    """Imports the XML documents of a directory into the database.
+
+    Entity and relationship names are built the same way as in the schema
+    builder (``cleanEntityName`` applied to the complete ``<key>_<lbl>`` name
+    and to the ``txt`` attribute of a block), so the rows end up in the tables
+    created from the schemas.
+    """
+
+    def __init__(self, engine: Session, metadata: Session, docsDir: str):
+        self.engine = engine
+        self.metadata = metadata
+        self.docsDir = docsDir
+
+    def importNode(self, node: ET.Element, parentUuid: str = None, parentKey: str = None):
+        """Imports the children of a node from an XML file into the database.
+
+        Children with a ``txt`` attribute are entities: each one is inserted
+        into its own table and linked to the parent through a relationship
+        table. All other children with text become fields of the returned row.
+
+        Args:
+            node (ET.Element): The node to import.
+            parentUuid (str, optional): The UUID of the parent node. Defaults to None.
+            parentKey (str, optional): The key of the parent node. Defaults to None.
+
+        Returns:
+            dict: The row for the node itself (its uuid, if given, and its plain fields).
+        """
+
+        data: dict = {'f__uuid': parentUuid} if parentUuid else {}  # The row of the node itself
+
+        for child in node:
+            # Clean the complete name, exactly as the schema builder does.
+            classKey: str = cleanEntityName(f"{child.get('key')}_{child.get('lbl')}")
+            fieldName: str = f"f__{classKey}"  # The name of the field
+
+            if 'txt' in child.attrib:
+                # If the child node has a text attribute, it is an entity.
+                entityUuid: str = str(uuid.uuid4())  # The UUID for the entity
+                childData: dict = {
+                    "f__uuid": entityUuid,
+                    fieldName: child.get('txt'),
+                }  # The row (with uuid) for the child node
+
+                if len(child) > 0:
+                    # Pass the entity down as parent; otherwise the relations of
+                    # the nested entities would be written with parent None.
+                    childData.update(self.importNode(child, entityUuid, classKey))
+
+                # Insert the data of the child node into the database.
+                insertData2Db(self.engine, f"c__{classKey}", childData)
+
+                # Insert the relationship data into the database.
+                self.insertRelData(parentUuid, parentKey, entityUuid, classKey)
+
+            elif child.text is not None:
+                # A plain value: restore the line breaks and store it as a field.
+                data[fieldName] = child.text.replace('###{new_line}### ', '\n')
+        return data
+
+    def processXmlFile(self, filePath: str, fileName: str):
+        """Processes an XML file and imports the data into the database.
+
+        Args:
+            filePath (str): The path to the XML file.
+            fileName (str): The name of the XML file (currently unused).
+        """
+
+        tree = ET.parse(filePath)  # The XML tree.
+        root = tree.getroot()  # The root of the XML tree.
+
+        for block in root.iter('block'):
+            if 'txt' not in block.attrib:
+                # Only blocks with a 'txt' attribute are imported.
+                continue
+
+            # The cleaned name is used for the table and for the relationship
+            # tables, matching the names written by the schema builder.
+            classKey: str = cleanEntityName(block.get('txt'))
+            blockUuid: str = str(uuid.uuid4())  # The UUID for the block
+            data: dict = self.importNode(block, blockUuid, classKey)  # The data to import.
+            tableName: str = f"c__{classKey}"  # The name of the table to import the data into.
+            try:
+                insertData2Db(self.engine, tableName, data)
+            except Exception as e:
+                # Best effort: report the failing block and continue with the next one.
+                print(f"An error occurred while inserting data into {tableName}: {e}")
+
+    def importData(self):
+        """Imports all XML files in a directory into the database.
+            Walks through the directory and processes each XML file.
+        """
+
+        # Get the total number of XML files for the progress bar
+        totalFiles = sum(
+            len([f for f in files if f.endswith('.xml')])
+            for r, d, files in os.walk(self.docsDir)
+        )
+        with tqdm(total=totalFiles, desc="Processing XML files", ncols=75) as pbar:
+            for dirpath, dirnames, filenames in os.walk(self.docsDir):
+                # Walk through the directory and process each XML file
+                for fileName in filenames:
+                    if fileName.endswith('.xml'):
+                        self.processXmlFile(os.path.join(dirpath, fileName), fileName)
+                        # Update the progress bar
+                        pbar.update(1)
+        print('Data imported.')
+
+    def insertRelData(self, parentUuid: str, parentKey: str, entityUuid: str, classKey: str):
+        """Imports the relationship data into the database.
+            Args:
+                parentUuid (str): The UUID of the parent entity.
+                parentKey (str): The key of the parent entity.
+                entityUuid (str): The UUID of the entity.
+                classKey (str): The key of the entity.
+        """
+
+        relationTableName: str = f"r__{parentKey}__{classKey}"  # The name of the relation table
+        relRow = {f"f__{parentKey}__uuid": parentUuid,
+                  f"f__{classKey}__uuid": entityUuid}  # The row to insert into the relation table
+        relDf = pd.DataFrame([relRow])  # The dataframe to insert into the relation table
+        relDf.to_sql(relationTableName, self.engine, if_exists='append',
+                     index=False)  # Insert the dataframe into the relation table
--- a/initDb.py
+++ b/initDb.py
@ -0,0 +1,36 @@
+import os
+from sqlalchemy import create_engine, MetaData
+from initSchemas import initClassesFromSchemas, Base
+
+
+
+# Database Initialization
+def initDb(_production, schemaDir):
+    """Create the SQLite database and all tables described by the schemas.
+
+    Args:
+        _production: Truthy to use ``database.db``, otherwise ``test.db``.
+        schemaDir: The directory containing the schema JSON files.
+
+    Returns:
+        tuple: ``(engine, metadata)`` on success, ``(False, False)`` if the
+        classes could not be initialized from the schemas.
+    """
+    print('Initializing the classes from the schemas...')
+    classesReady = initClassesFromSchemas(schemaDir)
+    if not classesReady:
+        print('Cannot initialize database. No schemas found.')
+        return (False, False)
+
+    dbName = 'database.db' if _production else 'test.db'
+
+    # The database file lives next to this script, independent of the working directory.
+    scriptDir = os.path.dirname(os.path.realpath(__file__))
+    dbPath = os.path.join(scriptDir, dbName)
+
+    engine = create_engine(f'sqlite:///{dbPath}')
+    # NOTE(review): this is a fresh, empty MetaData; the tables are registered
+    # on Base.metadata — confirm which one callers are supposed to receive.
+    metadata = MetaData()
+
+    # Create every table known to the declarative base.
+    Base.metadata.create_all(engine)
+
+    print('Database initialized.')
+    return engine, metadata
--- a/initSchemas.py
+++ b/initSchemas.py
@ -0,0 +1,59 @@
+import json
+import os
+from sqlalchemy import Column, Integer, String, Table
+from sqlalchemy.ext.declarative import declarative_base
+
+Base = declarative_base()
+
+def createClass(name, columns):
+    """Build a SQLAlchemy declarative class from a schema name and its columns.
+
+    The class name, the table name and every column name are lower-cased and
+    have the characters ``- . ( ) ?`` and spaces replaced by ``_`` and the
+    German umlauts / sharp s transliterated.
+
+    Args:
+        name (str): The name of the class.
+        columns (list): The columns of the class.
+
+    Returns:
+        SQLAlchemy.Class: The SQLAlchemy class.
+    """
+    substitutions = (
+        ('-', '_'), ('.', '_'), (' ', '_'), ('(', '_'), (')', '_'),
+        ('ä', 'ae'), ('ö', 'oe'), ('ü', 'ue'), ('ß', 'ss'), ('?', '_'),
+    )
+
+    def sanitize(text):
+        # Same replacements, in the same order, for class, table and column names.
+        text = text.lower()
+        for old, new in substitutions:
+            text = text.replace(old, new)
+        return text
+
+    tableName = sanitize(name)
+    className = tableName  # Class and table share the sanitized name.
+
+    attrs = {'__tablename__': tableName}
+    hasUuid = False
+    for prop in columns:
+        # NOTE(review): the schema builder names the column 'f__uuid', which
+        # never equals 'uuid', so the 'id' fallback below is presumably always
+        # taken — confirm whether 'f__uuid' was meant to be the primary key.
+        isUuid = prop.lower() == 'uuid'
+        hasUuid = hasUuid or isUuid
+        attrs[sanitize(prop)] = Column(String, primary_key=True) if isUuid else Column(String)
+
+    # Without a 'uuid' column an integer 'id' serves as primary key.
+    if not hasUuid:
+        attrs['id'] = Column(Integer, primary_key=True)
+
+    # Create the declarative class dynamically.
+    cls = type(className, (Base,), attrs)
+
+    # Define the table with extend_existing=True
+    Table(tableName, Base.metadata, extend_existing=True)
+
+    return cls
+
+def initClassesFromSchemas(schemaDir):
+    """Initialize the SQLAlchemy classes from the schema JSON files.
+
+    Args:
+        schemaDir (str): The directory containing the ``*.json`` schema files.
+
+    Returns:
+        bool: True if at least one schema file was found and loaded, False if
+        the directory does not exist or contains no ``*.json`` file.
+    """
+
+    if not os.path.isdir(schemaDir):
+        print('Schema directory does not exist.')
+        return False
+
+    # Only JSON files count as schemas; sorted for a reproducible load order.
+    schemaList = sorted(fileName for fileName in os.listdir(schemaDir) if fileName.endswith('.json'))
+
+    if not schemaList:
+        print('No schemas JSON\'s found.')
+        return False
+
+    for fileName in schemaList:
+        # The schemas are written as UTF-8 with ensure_ascii=False, so they
+        # must be read as UTF-8 regardless of the platform default encoding.
+        with open(os.path.join(schemaDir, fileName), 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        cls = createClass(data['name'], data['columns'])
+        globals()[cls.__name__] = cls  # Add the class to the global namespace
+    print('Classes initialized from schemas.')
+    return True
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,3 @@
+pandas
+sqlalchemy
+tqdm
--- a/utils.py
+++ b/utils.py
@ -0,0 +1,9 @@
+from sqlalchemy import MetaData, Table
+
+def cleanEntityName(entityName):
+    """Return the lower-cased name with special characters made identifier-safe.
+
+    ``-``, ``.``, spaces, ``(``, ``)`` and ``?`` become ``_``; ``ä``, ``ö``,
+    ``ü`` and ``ß`` become ``ae``, ``oe``, ``ue`` and ``ss``.
+    """
+    # One translation pass; none of the replacement texts contains a replaced character.
+    table = str.maketrans({
+        '-': '_', '.': '_', ' ': '_', '(': '_', ')': '_', '?': '_',
+        'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss',
+    })
+    return entityName.lower().translate(table)
+
+def tableExists(engine, table_name):
+    """Return True if a table named ``table_name`` exists in the database.
+
+    Only the requested table is looked up. The previous version reflected the
+    complete database into a new MetaData on every call, and this function is
+    called once per inserted row.
+
+    Args:
+        engine: The SQLAlchemy engine bound to the database.
+        table_name (str): The name of the table to look for.
+    """
+    # Imported here so the module-level import line does not have to change.
+    from sqlalchemy import inspect
+
+    # A fresh inspector per call, so newly created tables are always seen.
+    return inspect(engine).has_table(table_name)