From e46a9fd4ece7e545efc15bdb38df605207d32cd7 Mon Sep 17 00:00:00 2001 From: rnsrk Date: Tue, 9 Sep 2025 10:16:31 +0200 Subject: [PATCH] new commit --- .gitignore | 2 + .vscode/launch.json | 15 + 00_start.py | 160 ++++++ 01_importMaterials.py | 81 --- 01_importMaterialsAndTechnique.py | 79 +++ 02_importAdministrator.py | 165 +++--- 03_importAdministratorStatus.py | 132 +++-- 03_importSource.py | 204 ++++---- 04_importArtistSourceReferenceAssignment.py | 125 ++--- 04_importMarks.py | 479 +++++++++--------- 04_importSourceReferenceAssignment.py | 123 ++--- 05_importArtist.py | 273 +++++----- 06_importLiterature.py | 200 ++++---- 07_importInspectionMark.py | 341 ++++++------- 07_importJournalAssignment.py | 128 +++-- 07_importLiteratureReferenceAssignment.py | 129 +++-- 07_importParentLiteratureAssignment.py | 129 +++-- 08_importInspectionMarkLocation.py | 119 ++--- 09_importInspectionMarkRelation.py | 118 ++--- 10_importMarkDatingInfo.py | 119 ++--- 11_importMarkInformation.py | 97 ---- 12_importBirth.py | 150 +++--- 13_importDeath.py | 154 +++--- 14_importDating.py | 125 ++--- 15_importGoldsmithRelation.py | 138 +++-- 16_importClient.py | 132 +++-- 17_importMentioned.py | 162 +++--- 18_importNumDating.py | 125 ++--- 19_importOriginAssignment.py | 131 +++-- 20_importWorkshops.py | 127 +++-- 21_importArtifacts.py | 207 ++++++++ 21_importArtifcats.py | 213 -------- 22_importArtifactRelation.py | 129 +++-- 24_importArtistAssignment.py | 120 ++--- 25_importMarkInformation.py | 82 +++ 25_importPhotographer.py | 90 ---- 26_importPhotographer.py | 75 +++ ..._importArtifactToArtistRelationRelation.py | 120 ++--- ...mportArtifactToClientAssignmentRelation.py | 121 ++--- ...rtifactToInspectionMarkLocationRelation.py | 120 ++--- ...ToLiteratureReferenceAssignmentRelation.py | 122 ++--- ...factToMarkInformationAssignmentRelation.py | 121 ++--- 98__r__importArtifactToMaterialRelation.py | 121 ++--- ...r__importArtifactToNumericeDateRelation.py | 121 ++--- 98__r__importArtifactToPhotographRelation.py | 121 ++--- 98__r__importArtifactToRelationRelation.py | 121 ++--- 98__r__importArtifactToSourceRelation.py | 121 ++--- ...rtArtifactToStatusAdministratorRelation.py | 127 +++-- 98__r__importArtistToBirthRelation.py | 127 +++-- 98__r__importArtistToDeathRelation.py | 127 +++-- 98__r__importArtistToGoldsmithRelation.py | 127 +++-- ...portArtistToLiteratureReferenceRelation.py | 127 +++-- 98__r__importArtistToMentionedRelation.py | 127 +++-- 98__r__importArtistToOriginRelation.py | 127 +++-- 98__r__importArtistToWorkshopRelation.py | 127 +++-- ...MarkDatingInformationAssignmentRelation.py | 124 ++--- ...r__importInspectionMarkRelationRelation.py | 127 +++-- ...ectionMarkToLiteratureReferenceRelation.py | 127 +++-- 98__r__importLiteratureToJournalRelation.py | 127 +++-- ...rtLiteratureToParentPublicationRelation.py | 127 +++-- 98__r__importMarkToDatingRelation.py | 124 ++--- 98__r__importMarkToLiteratureRelation.py | 128 +++-- 98__r__importMarkToMarkInformationRelation.py | 127 +++-- 98__r__importMarkToSourceRelation.py | 127 +++-- 98__r__importSourceToDateRelation.py | 127 +++-- ...ToLiteratureReferenceAssignmentRelation.py | 127 +++-- initDb.py | 2 +- initSchemas.py | 5 +- requirements.txt | 2 +- 69 files changed, 4199 insertions(+), 4805 deletions(-) create mode 100644 .vscode/launch.json create mode 100644 00_start.py delete mode 100644 01_importMaterials.py create mode 100644 01_importMaterialsAndTechnique.py delete mode 100644 11_importMarkInformation.py create mode 100644 21_importArtifacts.py delete mode 100644 21_importArtifcats.py create mode 100644 25_importMarkInformation.py delete mode 100644 25_importPhotographer.py create mode 100644 26_importPhotographer.py diff --git a/.gitignore b/.gitignore index 77cb159..f9c5ae4 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ wisski_py __pycache__ logs/* .venv +.env +.vscode diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..e0121b7 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python Debugger: Current File", + "type": "debugpy", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal" + } + ] +} \ No newline at end of file diff --git a/00_start.py b/00_start.py new file mode 100644 index 0000000..0c1c0e9 --- /dev/null +++ b/00_start.py @@ -0,0 +1,160 @@ +from importlib import import_module +from initDb import initDb # For database initialization +from wisski.api import Api, Pathbuilder, Entity # For WissKI API +import os # For environment variable loading +from dotenv import load_dotenv # For environment variable loading +from time import sleep + +# Import entities +material_module = import_module("01_importMaterialsAndTechnique") +administrator_module = import_module("02_importAdministrator") +administrator_status_module = import_module("03_importAdministratorStatus") +source_module = import_module("03_importSource") +artist_source_reference_assignment_module = import_module("04_importArtistSourceReferenceAssignment") +marks_module = import_module("04_importMarks") +source_reference_assignment_module = import_module("04_importSourceReferenceAssignment") +artist_module = import_module("05_importArtist") +literature_module = import_module("06_importLiterature") +inspection_mark_module = import_module("07_importInspectionMark") +journal_assignment_module = import_module("07_importJournalAssignment") +literature_reference_assignment_module = import_module("07_importLiteratureReferenceAssignment") +parent_literature_assignment_module = import_module("07_importParentLiteratureAssignment") +inspection_mark_location_module = import_module("08_importInspectionMarkLocation") +inspection_mark_relation_module = import_module("09_importInspectionMarkRelation") +mark_dating_info_module = import_module("10_importMarkDatingInfo") +birth_module = import_module("12_importBirth") +death_module = import_module("13_importDeath") +dating_module = import_module("14_importDating") +goldsmith_relation_module = import_module("15_importGoldsmithRelation") +client_module = import_module("16_importClient") +mentioned_module = import_module("17_importMentioned") +num_dating_module = import_module("18_importNumDating") +origin_assignment_module = import_module("19_importOriginAssignment") +workshops_module = import_module("20_importWorkshops") +artifacts_module = import_module("21_importArtifacts") +artifact_relation_module = import_module("22_importArtifactRelation") +artist_assignment_module = import_module("24_importArtistAssignment") +mark_information_module = import_module("25_importMarkInformation") +photographer_module = import_module("26_importPhotographer") + +# Import relations +artifact_to_artist_relation_module = import_module("98__r__importArtifactToArtistRelationRelation") +artifact_to_client_assignment_relation_module = import_module("98__r__importArtifactToClientAssignmentRelation") +artifact_to_inspection_mark_location_relation_module = import_module("98__r__importArtifactToInspectionMarkLocationRelation") +artifact_to_literature_reference_assignment_relation_module = import_module("98__r__importArtifactToLiteratureReferenceAssignmentRelation") +artifact_to_mark_information_assignment_relation_module = import_module("98__r__importArtifactToMarkInformationAssignmentRelation") +artifact_to_material_relation_module = import_module("98__r__importArtifactToMaterialRelation") +artifact_to_numerice_date_relation_module = import_module("98__r__importArtifactToNumericeDateRelation") +artifact_to_photograph_relation_module = import_module("98__r__importArtifactToPhotographRelation") +artifact_to_relation_relation_module = import_module("98__r__importArtifactToRelationRelation") +artifact_to_source_relation_module = import_module("98__r__importArtifactToSourceRelation") +artifact_to_status_administrator_relation_module = import_module("98__r__importArtifactToStatusAdministratorRelation") +artist_to_birth_relation_module = import_module("98__r__importArtistToBirthRelation") +artist_to_death_relation_module = import_module("98__r__importArtistToDeathRelation") +artist_to_goldsmith_relation_module = import_module("98__r__importArtistToGoldsmithRelation") +artist_to_literature_reference_relation_module = import_module("98__r__importArtistToLiteratureReferenceRelation") +artist_to_mentioned_relation_module = import_module("98__r__importArtistToMentionedRelation") +artist_to_origin_relation_module = import_module("98__r__importArtistToOriginRelation") +artist_to_workshop_relation_module = import_module("98__r__importArtistToWorkshopRelation") +inspection_mark_dating_information_assignment_relation_module = import_module("98__r__importInspectionMarkDatingInformationAssignmentRelation") +inspection_mark_relation_relation_module = import_module("98__r__importInspectionMarkRelationRelation") +inspection_mark_to_literature_reference_relation_module = import_module("98__r__importInspectionMarkToLiteratureReferenceRelation") +literature_to_journal_relation_module = import_module("98__r__importLiteratureToJournalRelation") +literature_to_parent_publication_relation_module = import_module("98__r__importLiteratureToParentPublicationRelation") +mark_to_dating_relation_module = import_module("98__r__importMarkToDatingRelation") +mark_to_literature_relation_module = import_module("98__r__importMarkToLiteratureRelation") +mark_to_mark_information_relation_module = import_module("98__r__importMarkToMarkInformationRelation") +mark_to_source_relation_module = import_module("98__r__importMarkToSourceRelation") +source_to_date_relation_module = import_module("98__r__importSourceToDateRelation") +source_to_literature_reference_assignment_relation_module = import_module("98__r__importSourceToLiteratureReferenceAssignmentRelation") + +# Initialize the database +print('Initializing the database...') +engine, metadata, Session = initDb(True, './schemas/') +if engine == False: + print('Database initialization failed.') + exit() + +# Load the environment variables +load_dotenv() + +# Initialize the WissKI API +print('Initializing the WissKI API...') +api_url = os.getenv('API_URL') +auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) +headers = {"Cache-Control": "no-cache"} +api = Api(api_url, auth, headers) +api.pathbuilders = ['default'] + +trials = 0 +while trials < 3 : + trials += 1 + try: + # Call the function from the imported module + material_module.importMaterialsAndTechnique(api, engine) + administrator_module.importAdministrator(api, engine) + administrator_status_module.importAdministratorStatus(api, engine) + source_module.importSource(api, engine) + artist_source_reference_assignment_module.importArtistSourceReferenceAssignment(api, engine) + marks_module.importMarks(api, engine) + source_reference_assignment_module.importSourceReferenceAssignment(api, engine) + artist_module.importArtist(api, engine) + literature_module.importLiterature(api, engine) + inspection_mark_module.importInspectionMark(api, engine) + journal_assignment_module.importJournalAssignment(api, engine) + literature_reference_assignment_module.importLiteratureReferenceAssignment(api, engine) + parent_literature_assignment_module.importParentLiteratureAssignment(api, engine) + inspection_mark_location_module.importInspectionMarkLocation(api, engine) + inspection_mark_relation_module.importInspectionMarkRelation(api, engine) + mark_dating_info_module.importMarkDatingInfo(api, engine) + birth_module.importBirth(api, engine) + death_module.importDeath(api, engine) + dating_module.importDating(api, engine) + goldsmith_relation_module.importGoldsmithRelation(api, engine) + client_module.importClient(api, engine) + mentioned_module.importMentioned(api, engine) + num_dating_module.importNumDating(api, engine) + origin_assignment_module.importOriginAssignment(api, engine) + workshops_module.importWorkshops(api, engine) + artifacts_module.importArtifacts(api, engine) + artifact_relation_module.importArtifactRelation(api, engine) + artist_assignment_module.importArtistAssignment(api, engine) + mark_information_module.importMarkInformation(api, engine) + photographer_module.importPhotographer(api, engine) + + api.pathbuilders = ['relations'] + artifact_to_artist_relation_module.importArtifactToArtistRelationRelation(api, engine) + artifact_to_client_assignment_relation_module.importArtifactToClientAssignmentRelation(api, engine) + artifact_to_inspection_mark_location_relation_module.importArtifactToInspectionMarkLocationRelation(api, engine) + artifact_to_literature_reference_assignment_relation_module.importArtifactToLiteratureReferenceAssignmentRelation(api, engine) + artifact_to_mark_information_assignment_relation_module.importArtifactToMarkInformationAssignmentRelation(api, engine) + artifact_to_material_relation_module.importArtifactToMaterialRelation(api, engine) + artifact_to_numerice_date_relation_module.importArtifactToNumericeDateRelation(api, engine) + artifact_to_photograph_relation_module.importArtifactToPhotographRelation(api, engine) + artifact_to_relation_relation_module.importArtifactToRelationRelation(api, engine) + artifact_to_source_relation_module.importArtifactToSourceRelation(api, engine) + artifact_to_status_administrator_relation_module.importArtifactToStatusAdministratorRelation(api, engine) + artist_to_birth_relation_module.importArtistToBirthRelation(api, engine) + artist_to_death_relation_module.importArtistToDeathRelation(api, engine) + artist_to_goldsmith_relation_module.importArtistToGoldsmithRelation(api, engine) + artist_to_literature_reference_relation_module.importArtistToLiteratureReferenceRelation(api, engine) + artist_to_mentioned_relation_module.importArtistToMentionedRelation(api, engine) + artist_to_origin_relation_module.importArtistToOriginRelation(api, engine) + artist_to_workshop_relation_module.importArtistToWorkshopRelation(api, engine) + inspection_mark_dating_information_assignment_relation_module.importInspectionMarkDatingInformationAssignmentRelation(api, engine) + inspection_mark_relation_relation_module.importInspectionMarkRelationRelation(api, engine) + inspection_mark_to_literature_reference_relation_module.importInspectionMarkToLiteratureReferenceRelation(api, engine) + literature_to_journal_relation_module.importLiteratureToJournalRelation(api, engine) + literature_to_parent_publication_relation_module.importLiteratureToParentPublicationRelation(api, engine) + mark_to_dating_relation_module.importMarkToDatingRelation(api, engine) + mark_to_literature_relation_module.importMarkToLiteratureRelation(api, engine) + mark_to_mark_information_relation_module.importMarkToMarkInformationRelation(api, engine) + mark_to_source_relation_module.importMarkToSourceRelation(api, engine) + source_to_date_relation_module.importSourceToDateRelation(api, engine) + source_to_literature_reference_assignment_relation_module.importSourceToLiteratureReferenceAssignmentRelation(api, engine) + except Exception as e: + print(f'Error: {e}') + print(f'Trial {trials} of 3 failed.') + print(f'Retrying in 10 seconds...') + sleep(10) + continue diff --git a/01_importMaterials.py b/01_importMaterials.py deleted file mode 100644 index d7a3f29..0000000 --- a/01_importMaterials.py +++ /dev/null @@ -1,81 +0,0 @@ -import uuid # For UUID creation -from initDb import initDb # For database initialization -from wisski.api import Api, Pathbuilder, Entity # For WissKI API -import os # For environment variable loading -from dotenv import load_dotenv # For environment variable loading -import pandas as pd # For dataframe handling - -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() - -# Load the environment variables -load_dotenv() - -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilders = ['default'] - -try: - processedRows = pd.read_csv(f'./logs/processedMaterials.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri']) - -# Load materials table -materialsTable = pd.read_sql_table('c__5280_material', con=engine) - -# Create materials -for index, row in materialsTable.iterrows(): - # For every row in table... - if index < len(processedRows) and materialsTable.loc[index, 'id'] == processedRows.iloc[index, 'id']: - # skip if already processed - print(f'Skipping already processed material {materialsTable.iloc[index, 0]}') - continue - # Create Entity property dicts - materialValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value - continue - # Properties of an entity have to be an array, so... - if '###{{new_line}}###' in str(value): - print('replaced curly braces') - value = str(value).replace('###{{new_line}}###', '') - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - match key: - case 'id': - continue - case 'f__uuid': - materialValues['fedfe553c2332bd4902c887813f29ed8'] = value # UUID - case 'f__5280_material': - materialValues['f5f4251312f54c0d104ea87761b94bde'] = value # Material - case 'f__5300_technik': - materialValues['f231e08850022f091ebd5055d8aad30f'] = value # Technique - case _: - print(f'{key} is not a valid field, skipping.') - - # Create Material - material = Entity(api=api, fields=materialValues, bundle_id='b45978f2b073ff3c73b3c7220ebb3b89') - api.save(material) - - print(f'Created material {index}: {material.uri}') - - # Write log - processedRows = processedRows._append({'id': row['id'], 'uuid': materialValues['fedfe553c2332bd4902c887813f29ed8'][0], 'uri': material.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processedMaterials.csv', index=False) - -print('finish') diff --git a/01_importMaterialsAndTechnique.py b/01_importMaterialsAndTechnique.py new file mode 100644 index 0000000..bab2a1b --- /dev/null +++ b/01_importMaterialsAndTechnique.py @@ -0,0 +1,79 @@ +import uuid # For UUID creation +from initDb import initDb # For database initialization +from wisski.api import Api, Pathbuilder, Entity # For WissKI API +import os # For environment variable loading +from dotenv import load_dotenv # For environment variable loading +import pandas as pd # For dataframe handling + +def importMaterialsAndTechnique(api, engine): + print('Importing materials and technique...') + + tableName = 'c__5280_material' + bundleId = 'b45978f2b073ff3c73b3c7220ebb3b89' + + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri']) + + # Load materials table + sqlTable = pd.read_sql_table(tableName, con=engine) + + # Create materials + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed material {sqlTable.loc[index, "id"]}') + continue + # Create Entity property dicts + materialValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # If value is a list of comma-separated strings, split each item by ',' and flatten. + if isinstance(value, list): + new_value = [] + for v in value: + if isinstance(v, str) and ',' in v: + new_value.extend([x.strip() for x in v.split(',') if x.strip()]) + else: + new_value.append(v) + value = new_value + + # Map columns to fields. We use assignments for reification. + match key: + case 'id': + continue + case 'f__uuid': + materialValues['fedfe553c2332bd4902c887813f29ed8'] = value # UUID + case 'f__5280_material': + materialValues['f5f4251312f54c0d104ea87761b94bde'] = value # Material + case 'f__5300_technik': + materialValues['f231e08850022f091ebd5055d8aad30f'] = value # Technique + case _: + print(f'{key} is not a valid field, skipping.') + + # Create Material + material = Entity(api=api, fields=materialValues, bundle_id=bundleId) + api.save(material) + + print(f'Created material {index}: {material.uri} of {len(sqlTable)}') + + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': materialValues['fedfe553c2332bd4902c887813f29ed8'][0], 'uri': material.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + + print('finish') diff --git a/02_importAdministrator.py b/02_importAdministrator.py index 91611f7..2e6c659 100644 --- a/02_importAdministrator.py +++ b/02_importAdministrator.py @@ -5,105 +5,92 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importAdministrator(api, engine): + print('Importing administrators...') + tableName = 'c__vwr' + bundleId = 'b4e5a6a31ff575ab09b07b5f27d322ab' -# Load the environment variables -load_dotenv() + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri']) -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilders = ['default'] + # Load sources table + administratorsTable = pd.read_sql_table(tableName, con=engine) -try: - processedRows = pd.read_csv(f'./logs/processedAdministrators.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['administratorId', 'uuid', 'uri']) -# Load sources table -administratorsTable = pd.read_sql_table('c__vwr', con=engine) -administratorValues = {} -digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]} - -# Create administrators -for index, row in administratorsTable.iterrows(): - # For every row in table... - if index < len(processedRows) and administratorsTable.iloc[index, 0] == processedRows.iloc[index, 0]: - # skip if already processed - print(f'Skipping already processed administrator {administratorsTable.iloc[index, 0]}') - continue - # Create Entity property dicts - administratorValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create administrators + for index, row in administratorsTable.iterrows(): + administratorValues = {} + digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]} + # For every row in table... + if index < len(processedRows) and administratorsTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed administrator {administratorsTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '###{' in str(value): - print('replaced curly braces') - value = str(value).replace('###{new_line', '') - value = str(value).replace('}###', '') - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - match key: - case 'id': + # Create Entity property dicts + administratorValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value continue - case 'f__uuid': - administratorValues['f707e595ce7301d61c064e8e44c9c4f4'] = value # UUID - case 'f__vwra_vwr_adresse': - administratorValues['f303bbabf3d97536777b0f552d20bc7a'] = value # Address - case 'f__vwrn_vwr_dok_nr_': - administratorValues['f37e82c36b4fc6b275a1a86a389481e1'] = value # Administrator document number - case 'f__vwrb_verw_publ_bez': - administratorValues['ffc50ffbcc3f411ed63e3c6dfc6b4d80'] = value # Appellation in publication - case 'f__9990_kommentar': - administratorValues['fcf9600af8c3eff355eb42466e9aac39'] = value # Comment - case 'f__2900_verw_langbez_': - administratorValues['f78d3c9e6800adbb8a9af0867cbdf3c7'] = value # Long Appellation - case 'f__2864_ort': - administratorValues['fecf6c9d7cbae513923e411178516378'] = value # Place - case 'f__290a_verw_kurzbez_': - administratorValues['fddaae99f4c6a835d9f9f195523c85f7'] = value # Short appellation - # Digitisation Process - case 'f__9900_datum_erfassung': - digitisationProcessValues['f1f5dd22371e5c1de41e0fb099e0e862'] = value # Recording date - case 'f__99ae_datum_aenderung': - digitisationProcessValues['f8976c6a9e5d91fe9caba8a57c27f204'] = value # Change date - case 'f__efbm_bem_erfassung': - digitisationProcessValues['f78a6310d13c717b82ddba814ac59024'] = value # Recording note - case _: - print(f'{key} is not a valid field, skipping.') + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + match key: + case 'id': + continue + case 'f__uuid': + administratorValues['f707e595ce7301d61c064e8e44c9c4f4'] = value # UUID + case 'f__vwra_vwr_adresse': + administratorValues['f303bbabf3d97536777b0f552d20bc7a'] = value # Address + case 'f__vwrn_vwr_dok_nr_': + administratorValues['f37e82c36b4fc6b275a1a86a389481e1'] = value # Administrator document number + case 'f__vwrb_verw_publ_bez': + administratorValues['ffc50ffbcc3f411ed63e3c6dfc6b4d80'] = value # Appellation in publication + case 'f__9990_kommentar': + administratorValues['fcf9600af8c3eff355eb42466e9aac39'] = value # Comment + case 'f__2900_verw_langbez_': + administratorValues['f78d3c9e6800adbb8a9af0867cbdf3c7'] = value # Long Appellation + case 'f__2864_ort': + administratorValues['fecf6c9d7cbae513923e411178516378'] = value # Place + case 'f__290a_verw_kurzbez_': + administratorValues['fddaae99f4c6a835d9f9f195523c85f7'] = value # Short appellation + # Digitisation Process + case 'f__9900_datum_erfassung': + digitisationProcessValues['f1f5dd22371e5c1de41e0fb099e0e862'] = value # Recording date + case 'f__99ae_datum_aenderung': + digitisationProcessValues['f8976c6a9e5d91fe9caba8a57c27f204'] = value # Change date + case 'f__efbm_bem_erfassung': + digitisationProcessValues['f78a6310d13c717b82ddba814ac59024'] = value # Recording note + case _: + print(f'{key} is not a valid field, skipping.') - # Create Digitisation Process - digitisationProcess = Entity(api=api, fields=digitisationProcessValues, bundle_id='b22e6c47ccb3ab8a974b37279e1bc33b') - api.save(digitisationProcess) + # Create Digitisation Process + digitisationProcess = Entity(api=api, fields=digitisationProcessValues, bundle_id='b22e6c47ccb3ab8a974b37279e1bc33b') + api.save(digitisationProcess) - # Set Digitisation Process - administratorValues['f3ec4640a87bd4534763af0fca050193'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process + # Set Digitisation Process + administratorValues['f3ec4640a87bd4534763af0fca050193'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process - # Create Material - administrator = Entity(api=api, fields=administratorValues, bundle_id='b4e5a6a31ff575ab09b07b5f27d322ab') # Administrator - api.save(administrator) + # Create Administrator + administrator = Entity(api=api, fields=administratorValues, bundle_id=bundleId) # Administrator + api.save(administrator) - print(f'Created administrator {index}: {administrator.uri}') + print(f'Created administrator {index}: {administrator.uri} of {len(administratorsTable)}') - # Write log - processedRows = processedRows._append({'administratorId': administratorValues['f37e82c36b4fc6b275a1a86a389481e1'][0], 'uuid': administratorValues['f707e595ce7301d61c064e8e44c9c4f4'][0], 'uri': administrator.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processedAdministrators.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': administratorValues['f707e595ce7301d61c064e8e44c9c4f4'][0], 'uri': administrator.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finished importing administrators') diff --git a/03_importAdministratorStatus.py b/03_importAdministratorStatus.py index 6b15413..d500c7f 100644 --- a/03_importAdministratorStatus.py +++ b/03_importAdministratorStatus.py @@ -5,84 +5,74 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importAdministratorStatus(api, engine): + print('Importing administrator statuses...') + tableName = 'c__ob28_status_verwalt_' + bundleId = 'b45447146729190da3a1d3e19165a6f8' -# Load the environment variables -load_dotenv() + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri']) -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + # Load sources table + administratorStatusTable = pd.read_sql_table(tableName, con=engine) -try: - processedRows = pd.read_csv(f'./logs/processedAdministratorStatus.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri']) + # Create administratorStatuss + for index, row in administratorStatusTable.iterrows(): + administratorStatusValues = {} -# Load sources table -administratorStatusTable = pd.read_sql_table('c__ob28_status_verwalt_', con=engine) - -administratorStatusValues = {} - -# Create administratorStatuss -for index, row in administratorStatusTable.iterrows(): - # For every row in table... - if index < len(processedRows) and administratorStatusTable.iloc[index, 0] == processedRows.iloc[index, 0]: - # skip if already processed - print(f'Skipping already processed administratorStatus {administratorStatusTable.iloc[index, 0]}') - continue - # Create Entity property dicts - administratorStatusValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # For every row in table... + if index < len(processedRows) and administratorStatusTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed administratorStatus {administratorStatusTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - match key: - case 'id': + # Create Entity property dicts + administratorStatusValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value continue - case 'f__uuid': - administratorStatusValues['f5ea2a7495ec872781ddc06f862b4270'] = value # UUID - case 'f__290a_verw_kurzbez_': - administratorStatusValues['f08562a866d00cd5245c380c20e4e7f9'] = value # Admistrator short appellation - case 'f__2950_invent_nr_': - administratorStatusValues['f92ac041f6098335bf4075942a771ee3'] = value # Inventary - case 'f__2952_alte_i_nr_': - administratorStatusValues['fdc070143457df491f18347ac97b0f24'] = value # Old Identifier - case 'f__2864_ort': - administratorStatusValues['f9bc3796ceff9a3581bd8047545628b9'] = value # Place - case 'f__ob28_status_verwalt_': - administratorStatusValues['ff0265deb26c28f139345b89577b2539'] = value # Status - case 'f__2996_gelt_dauer': - administratorStatusValues['f3363962b4eaa4d38358bc1d2bda1a7f'] = value # Time-Span - case _: - print(f'{key} is not a valid field, skipping.') + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + match key: + case 'id': + continue + case 'f__uuid': + administratorStatusValues['f5ea2a7495ec872781ddc06f862b4270'] = value # UUID + case 'f__290a_verw_kurzbez_': + administratorStatusValues['f08562a866d00cd5245c380c20e4e7f9'] = value # Admistrator short appellation + case 'f__2950_invent_nr_': + administratorStatusValues['f92ac041f6098335bf4075942a771ee3'] = value # Inventary + case 'f__2952_alte_i_nr_': + administratorStatusValues['fdc070143457df491f18347ac97b0f24'] = value # Old Identifier + case 'f__2864_ort': + administratorStatusValues['f9bc3796ceff9a3581bd8047545628b9'] = value # Place + case 'f__ob28_status_verwalt_': + administratorStatusValues['ff0265deb26c28f139345b89577b2539'] = value # Status + case 'f__2996_gelt_dauer': + administratorStatusValues['f3363962b4eaa4d38358bc1d2bda1a7f'] = value # Time-Span + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - administratorStatus = Entity(api=api, fields=administratorStatusValues, bundle_id='b45447146729190da3a1d3e19165a6f8') - api.save(administratorStatus) + # Create Material + administratorStatus = Entity(api=api, fields=administratorStatusValues, bundle_id='b45447146729190da3a1d3e19165a6f8') + api.save(administratorStatus) - print(f'Created administratorStatus {index}: {administratorStatus.uri} of {len(administratorStatusTable)}') + print(f'Created administratorStatus {index}: {administratorStatus.uri} of {len(administratorStatusTable)}') - # Write log - processedRows = processedRows._append({'id': row['id'], 'uuid': administratorStatusValues['f5ea2a7495ec872781ddc06f862b4270'][0], 'uri': administratorStatus.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processedAdministratorStatus.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': administratorStatusValues['f5ea2a7495ec872781ddc06f862b4270'][0], 'uri': administratorStatus.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finished importing administrator statuses') diff --git a/03_importSource.py b/03_importSource.py index 1cc2c4d..ce0d248 100644 --- a/03_importSource.py +++ b/03_importSource.py @@ -5,124 +5,112 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importSource(api, engine): + print('Importing sources...') + tableName = 'c__que' + bundleId = 'b7dc57a93e008a58514b0d4ca26147b1' -# Load the environment variables -load_dotenv() + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=['id','sourceId', 'uuid', 'uri']) -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + # Load sources table + sourcesTable = pd.read_sql_table(tableName, con=engine) -try: - processedRows = pd.read_csv(f'./logs/processedSources.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['id','sourceId', 'uuid', 'uri']) -# Load sources table -sourcesTable = pd.read_sql_table('c__que', con=engine) -sourceValues = {} -digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]} - -# Create sources -for index, row in sourcesTable.iterrows(): - # For every row in table... - if index < processedRows['id'].max(): - # skip if already processed - print(f'Skipping already processed source {row['id']}') - continue - # Create Entity property dicts - sourceValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create sources + for index, row in sourcesTable.iterrows(): + sourceValues = {} + digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]} + # For every row in table... + if index < len(processedRows) and sourcesTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f"Skipping already processed source {sourcesTable.loc[index, 'id']}") continue - # Properties of an entity have to be an array, so... - if '###{{new_line}}###' in str(value): - print('replaced curly braces') - value = str(value).replace('###{{new_line}}###', '') - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - match key: - case 'id': + # Create Entity property dicts + sourceValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value continue - case 'f__uuid': - sourceValues['f9f02815a5631a85948d4d258a455f49'] = value # UUID - case 'f__9990_kommentar': - sourceValues['f89a563b07f965ca2dcb0b1bd178e863'] = value # Comment - case 'f__8080_verfasser': - sourceValues['f2d2934a6c72b5552f01042338ff5d67'] = value # Creator - case 'f__80bs_que__beschr_': - sourceValues['fd2122de6bcd62c61fcb7a9223baa20f'] = value # Description - case 'f__80bw_que__bewertung': - sourceValues['f70a7818de6e31eacea22148c92737ac'] = value # Evalutation - case 'f__8182_transkr__extern': - sourceValues['f409a3ea352d6bc55c27f6a93d239191'] = value # External Transkript - case 'f__2950_invent_nr_': - sourceValues['f71605f258ceb37ee5fcf2cd7871de2c'] = value # Inventary number - case 'f__2900_verw_langbez_': - sourceValues['f19d275cd6f48ef64d104997ca99291d'] = value # Long appellation administrator - case 'f__8540_repro_nr_': - sourceValues['f881dd5566725dc26a8b25cfba181792'] = value # Reproduction Number - case 'f__290a_verw_kurzbez_': - sourceValues['f343d954f8d95f1da98201a7f29ac81f'] = value # Short appellation Administrator - case 'f__8130_que_kurzt_': - sourceValues['f3faea3691516939fc4b0c2149ee2e5b'] = value # Shorttitle - case 'f__8000_que_dok_nr_': - sourceValues['f50ad6021b42c094f7e551faec831802'] = value # Source Document Identifier - case 'f__8092_untertitel': - sourceValues['fb734bd50628353b7b5c0bfc88f2cbdc'] = value # Subtitle - case 'f__80fp_vorhanden_als': - sourceValues['fd7b99a3db6191382401d69710ac192f'] = value # There as - case 'f__8090_titel': - sourceValues['f399332f583d268f07200efd1e3bb3c5'] = value # Title - case 'f__8180_transkript_': - sourceValues['f6585008a698902f45dc2a79b9a3a9de'] = value # Transcript - case 'f__8060_art': - sourceValues['f38c664e4f9b2effc83ebc50e1244442'] = value # Type - case 'f__2990_verbleib': - sourceValues['fae3bc551d146652898782f712f95749'] = value # Whereabouts - # Digitisation Process - case 'f__9900_datum_erfassung': - digitisationProcessValues['f1f5dd22371e5c1de41e0fb099e0e862'] = value # Recording date - case 'f__99ae_datum_aenderung': - digitisationProcessValues['f8976c6a9e5d91fe9caba8a57c27f204'] = value # Change date - case 'f__efbm_bem_erfassung': - digitisationProcessValues['f78a6310d13c717b82ddba814ac59024'] = value # Recording note - case _: - print(f'{key} is not a valid field, skipping.') + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + match key: + case 'id': + continue + case 'f__uuid': + sourceValues['f9f02815a5631a85948d4d258a455f49'] = value # UUID + case 'f__9990_kommentar': + sourceValues['f89a563b07f965ca2dcb0b1bd178e863'] = value # Comment + case 'f__8080_verfasser': + sourceValues['f2d2934a6c72b5552f01042338ff5d67'] = value # Creator + case 'f__80bs_que__beschr_': + sourceValues['fd2122de6bcd62c61fcb7a9223baa20f'] = value # Description + case 'f__80bw_que__bewertung': + sourceValues['f70a7818de6e31eacea22148c92737ac'] = value # Evalutation + case 'f__8182_transkr__extern': + sourceValues['f409a3ea352d6bc55c27f6a93d239191'] = value # External Transkript + case 'f__2950_invent_nr_': + sourceValues['f71605f258ceb37ee5fcf2cd7871de2c'] = value # Inventary number + case 'f__2900_verw_langbez_': + sourceValues['f19d275cd6f48ef64d104997ca99291d'] = value # Long appellation administrator + case 'f__8540_repro_nr_': + sourceValues['f881dd5566725dc26a8b25cfba181792'] = value # Reproduction Number + case 'f__290a_verw_kurzbez_': + sourceValues['f343d954f8d95f1da98201a7f29ac81f'] = value # Short appellation Administrator + case 'f__8130_que_kurzt_': + sourceValues['f3faea3691516939fc4b0c2149ee2e5b'] = value # Shorttitle + case 'f__8000_que_dok_nr_': + sourceValues['f50ad6021b42c094f7e551faec831802'] = value # Source Document Identifier + case 'f__8092_untertitel': + sourceValues['fb734bd50628353b7b5c0bfc88f2cbdc'] = value # Subtitle + case 'f__80fp_vorhanden_als': + sourceValues['fd7b99a3db6191382401d69710ac192f'] = value # There as + case 'f__8090_titel': + sourceValues['f399332f583d268f07200efd1e3bb3c5'] = value # Title + case 'f__8180_transkript_': + sourceValues['f6585008a698902f45dc2a79b9a3a9de'] = value # Transcript + case 'f__8060_art': + sourceValues['f38c664e4f9b2effc83ebc50e1244442'] = value # Type + case 'f__2990_verbleib': + sourceValues['fae3bc551d146652898782f712f95749'] = value # Whereabouts + # Digitisation Process + case 'f__9900_datum_erfassung': + digitisationProcessValues['f1f5dd22371e5c1de41e0fb099e0e862'] = value # Recording date + case 'f__99ae_datum_aenderung': + digitisationProcessValues['f8976c6a9e5d91fe9caba8a57c27f204'] = value # Change date + case 'f__efbm_bem_erfassung': + digitisationProcessValues['f78a6310d13c717b82ddba814ac59024'] = value # Recording note + case _: + print(f'{key} is not a valid field, skipping.') - # Create Digitisation Process - digitisationProcess = Entity(api=api, fields=digitisationProcessValues, bundle_id='b22e6c47ccb3ab8a974b37279e1bc33b') - api.save(digitisationProcess) + # Create Digitisation Process + digitisationProcess = Entity(api=api, fields=digitisationProcessValues, bundle_id='b22e6c47ccb3ab8a974b37279e1bc33b') + api.save(digitisationProcess) - # Set Digitisation Process - sourceValues['ffdf27e75013fa55d31f728ff5166f06'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process + # Set Digitisation Process + sourceValues['ffdf27e75013fa55d31f728ff5166f06'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process - # Create Material - source = Entity(api=api, fields=sourceValues, bundle_id='b7dc57a93e008a58514b0d4ca26147b1') - api.save(source) + # Create Material + source = Entity(api=api, fields=sourceValues, bundle_id=bundleId) + api.save(source) - print(f'Created source {index}: {source.uri}') + print(f'Created source {index}: {source.uri} of {len(sourcesTable)}') - # Write log - processedRows = processedRows._append({'id': row['id'], 'sourceId': sourceValues['f50ad6021b42c094f7e551faec831802'][0], 'uuid': sourceValues['f9f02815a5631a85948d4d258a455f49'][0], 'uri': source.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processedSources.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'sourceId': sourceValues['f50ad6021b42c094f7e551faec831802'][0], 'uuid': sourceValues['f9f02815a5631a85948d4d258a455f49'][0], 'uri': source.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finish') diff --git a/04_importArtistSourceReferenceAssignment.py b/04_importArtistSourceReferenceAssignment.py index 9806373..576caa6 100644 --- a/04_importArtistSourceReferenceAssignment.py +++ b/04_importArtistSourceReferenceAssignment.py @@ -5,83 +5,70 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtistSourceReferenceAssignment(api, engine): + print('Importing artist source reference assignments...') -# Load the environment variables -load_dotenv() + tableName = "c__81kr_que_kt_kue" + bundleId = 'bf71940d0b18c20511e2141159afb9de' # Artist source reference assignment -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) + + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "c__81kr_que_kt_kue" -bundleId = 'bf71940d0b18c20511e2141159afb9de' # Artist source reference assignment - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=[ 'docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.iloc[index, 0] == processedRows.iloc[index, 0]: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.iloc[index, 0]}') - continue - # Create Entity property dicts - entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create entities + for index, row in sqlTable.iterrows(): + entityValues = {} + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value continue - case 'f__uuid': - entityValues['fe3139ac03bd854ac9196fc240e7c68b'] = value # UUID - fUuid = value[0] - case 'f__8134_stelle': - entityValues['f58c13c5502baef24ede2a8a977ae6c6'] = value # Source reference - case 'f__81kr_que_kt_kue': - entityValues['f14d2d19f879d7398a384bdc132921a3'] = value # Source short title + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + continue + case 'f__uuid': + entityValues['fe3139ac03bd854ac9196fc240e7c68b'] = value # UUID + fUuid = value[0] + case 'f__8134_stelle': + entityValues['f58c13c5502baef24ede2a8a977ae6c6'] = value # Source reference + case 'f__81kr_que_kt_kue': + entityValues['f14d2d19f879d7398a384bdc132921a3'] = value # Source short title - case _: - print(f'{key} is not a valid field, skipping.') + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(tableName)}') + print(f'Created entity {index}: {entity.uri} of {len(tableName)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finish') diff --git a/04_importMarks.py b/04_importMarks.py index c207cca..f06cbb5 100644 --- a/04_importMarks.py +++ b/04_importMarks.py @@ -5,268 +5,255 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importMarks(api, engine): + print('Importing marks...') -# Load the environment variables -load_dotenv() + tableName = 'c__mar' + bundleId = 'b2c4e1c984d7758d7c7ec719110f7125' -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=['id', 'markId', 'uuid', 'uri']) -# Simple log + # Load mark table + sqlTable = pd.read_sql_table(tableName, con=engine) + print(f'Processing {len(sqlTable)} marks...') -try: - processedRows = pd.read_csv(f'./logs/processedMarks.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['id', 'markId', 'uuid', 'uri']) - -# Load mark table -markTable = pd.read_sql_table('c__mar', con=engine) -print(f'Processing {len(markTable)} marks...') - -# Create mark -for index, row in markTable.iterrows(): - # For every row in table... - if index < processedRows['id'].max(): - # skip if already processed - print(f'Skipping already processed mark {row['id']}') - continue - # Create Entity property dicts - markValues = {} - creationValues = {} - digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]} - dimensionValues = {} - featureValues = {} - featureDimensionValues = {} - imageValues = {} - imageAssignmentValues = {'f067784f5b1ff850672124a2b05360de': [str(uuid.uuid4())]} - - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create mark + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed mark {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '###{{new_line}}###' in str(value): - print('replaced curly braces') - value = str(value).replace('###{{new_line}}###', '') - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification - # for nested semantics, because we need to be efficient. - match key: - case 'id': + # Create Entity property dicts + markValues = {} + creationValues = {} + digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]} + dimensionValues = {} + featureValues = {} + featureDimensionValues = {} + imageValues = {} + imageAssignmentValues = {'f067784f5b1ff850672124a2b05360de': [str(uuid.uuid4())]} + + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value continue - case 'f__uuid': - markValues['fb40b199b4032e55acc152f994e93b45'] = value # UUID - case 'f__3002_pub_kue_nr_': - markValues['f6f0572ebec9c98e164d0e9aa0650c2e'] = value # Artist Number - case 'f__6700_mar_dok_nr_': - markValues['fe577970c02f173170ff3848a36b3b79'] = value # Mark Document Number - case 'f__6770_rosenb_nr_': - markValues['f6fc4b5726c97bad8b03ede860491649'] = value # Rosenberg Number - case 'f__9990_kommentar': - markValues['f01e527e707ff36bf966baa01c163378'] = value # Comment - case 'f__68an_abdruck_nr_': - markValues['f8324ea3c9ee378f1e19035e092aadb9'] = value # Print Number - case 'f__68nk_besonderheiten': - markValues['fa21e323a8a7a99ce3489e1f7753ac5f'] = value # Special Features - case 'f__8470_aufnahmenr_': - markValues['f67031e2a2b81ad9f318dc5b11d5a6af'] = value # Recording number - case 'f__684b_breite_marke': - # We map dimensions to Dimension entity. - dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['width'] # Type - dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension - dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification + # for nested semantics, because we need to be efficient. + match key: + case 'id': + continue + case 'f__uuid': + markValues['fb40b199b4032e55acc152f994e93b45'] = value # UUID + case 'f__3002_pub_kue_nr_': + markValues['f6f0572ebec9c98e164d0e9aa0650c2e'] = value # Artist Number + case 'f__6700_mar_dok_nr_': + markValues['fe577970c02f173170ff3848a36b3b79'] = value # Mark Document Number + case 'f__6770_rosenb_nr_': + markValues['f6fc4b5726c97bad8b03ede860491649'] = value # Rosenberg Number + case 'f__9990_kommentar': + markValues['f01e527e707ff36bf966baa01c163378'] = value # Comment + case 'f__68an_abdruck_nr_': + markValues['f8324ea3c9ee378f1e19035e092aadb9'] = value # Print Number + case 'f__68nk_besonderheiten': + markValues['fa21e323a8a7a99ce3489e1f7753ac5f'] = value # Special Features + case 'f__8470_aufnahmenr_': + markValues['f67031e2a2b81ad9f318dc5b11d5a6af'] = value # Recording number + case 'f__684b_breite_marke': + # We map dimensions to Dimension entity. + dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['width'] # Type + dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension + dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID - case 'f__684h_hoehe_marke': - dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['hight'] # Type - dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension - dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID - case 'f__68na_bz_breite_hoehe': - dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['width_x_hight'] # Type - dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension - dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID - case 'f__6840_rahmenform': - # We map features to Feature entity. - featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['frame_shape'] # Type - featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature - featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID - case 'f__684d_darst__marke': - # We map features to Feature entity. - featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['design'] # Type - featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature - featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID - case 'f__684l_text_marke': - # We map features to Feature entity. - featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['text'] # Type - featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature - featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID - case 'f__68nb_randanschluss': - # We map features to Feature entity. - featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['edge_connection'] # Type - featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature - featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID - case 'f__68nc_form_haste': - # We map features to Feature entity. - featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['haste_mould'] # Type - featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature - featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID - case 'f__68nd_form_schraegstr_': - # We map features to Feature entity. - featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['slash_form_shape'] # Type - featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature - featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID - case 'f__68ne_haste_schraegstr_': - # We map features to Feature entity. - featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['transition_haste_slash'] # Type - featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature - featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID - case 'f__68nf_n_knick': - # We map features to Feature entity. - featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['transition_haste_slash_kink'] # Type - featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature - featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID - case 'f__68ng_ueberg__serifen': - # We map features to Feature entity. - featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = [ - 'transition_serif_haste'] # Type - featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature - featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID - case 'f__68nh_dicke_ser__max_': - # We map (features) dimensions to Dimension entity. - featureDimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['maximum_thickness'] # Type - featureDimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension - featureDimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID - case 'f__68ni_dicke_ser__min': - # We map features to Feature entity. - featureDimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = [ - 'minimum_thickness'] # Type - featureDimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension - featureDimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID - case 'f__68nj_breite_serife': - # We map features to Feature entity. - featureDimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = [ - 'width'] # Type - featureDimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension - featureDimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID - case 'f__8540_repro_nr_': - # We map images to Image entity - for item in value: - if item is not None: - # Replace dir paths in name - item = item.replace('Objekte\\', 'objects/') - item = item.replace('Objekte3\\', 'objects/') - item = item.replace('Objekte4\\', 'objects/') - item = item.replace('Objekte5\\', 'objects/') - item = item.replace('objekte5\\', 'objects/') - item = item.replace('Marken\\', 'marks/') - item = item.replace('Marken/', 'marks/') - item = item.replace('MArken\\', 'marks/') - item = item.replace('Goldschmiede/', 'goldsmiths/') - item = item.replace('Goldschmiede\\', 'goldsmiths/') - item = item.replace('Epitaphien/', 'epitaphies/') - item = item.replace('Epitaphien\\', 'epitaphies/') - imageValues.setdefault(item, {})['feb10344eaa7a5f414d1e8392853eba9'] = [item] # Reproduction Number (Image) - imageValues[item]['fc8d57e55f203c75c2f8a1ae79378ac7'] = ['public://artifact_images/' + item + '.jpg'] # File - imageValues[item]['f11beac4b638016479e6f3fbc7e55d1a'] = [str(uuid.uuid4())] # UUID - case 'f__9900_datum_erfassung': - digitisationProcessValues['f1f5dd22371e5c1de41e0fb099e0e862'] = value # Recording date - case 'f__99ae_datum_aenderung': - digitisationProcessValues['f8976c6a9e5d91fe9caba8a57c27f204'] = value # Change date - case 'f__efbm_bem_erfassung': - digitisationProcessValues['f78a6310d13c717b82ddba814ac59024'] = value # Recording note - case 'f__ptxt_plug_in_text': - markValues['ffb8b04e8d57929a596fc32d6a84d07d'] = value # Plugin text - case _: - print(f'{key} is not a valid field, skipping.') + case 'f__684h_hoehe_marke': + dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['hight'] # Type + dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension + dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID + case 'f__68na_bz_breite_hoehe': + dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['width_x_hight'] # Type + dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension + dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID + case 'f__6840_rahmenform': + # We map features to Feature entity. + featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['frame_shape'] # Type + featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature + featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID + case 'f__684d_darst__marke': + # We map features to Feature entity. + featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['design'] # Type + featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature + featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID + case 'f__684l_text_marke': + # We map features to Feature entity. + featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['text'] # Type + featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature + featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID + case 'f__68nb_randanschluss': + # We map features to Feature entity. + featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['edge_connection'] # Type + featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature + featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID + case 'f__68nc_form_haste': + # We map features to Feature entity. + featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['haste_mould'] # Type + featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature + featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID + case 'f__68nd_form_schraegstr_': + # We map features to Feature entity. + featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['slash_form_shape'] # Type + featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature + featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID + case 'f__68ne_haste_schraegstr_': + # We map features to Feature entity. + featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['transition_haste_slash'] # Type + featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature + featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID + case 'f__68nf_n_knick': + # We map features to Feature entity. + featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['transition_haste_slash_kink'] # Type + featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature + featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID + case 'f__68ng_ueberg__serifen': + # We map features to Feature entity. + featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = [ + 'transition_serif_haste'] # Type + featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature + featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID + case 'f__68nh_dicke_ser__max_': + # We map (features) dimensions to Dimension entity. + featureDimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['maximum_thickness'] # Type + featureDimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension + featureDimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID + case 'f__68ni_dicke_ser__min': + # We map features to Feature entity. + featureDimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = [ + 'minimum_thickness'] # Type + featureDimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension + featureDimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID + case 'f__68nj_breite_serife': + # We map features to Feature entity. + featureDimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = [ + 'width'] # Type + featureDimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension + featureDimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID + case 'f__8540_repro_nr_': + # We map images to Image entity + for item in value: + if item is not None: + # Replace dir paths in name + item = item.replace('Objekte\\', 'objects/') + item = item.replace('Objekte3\\', 'objects/') + item = item.replace('Objekte4\\', 'objects/') + item = item.replace('objekte4\\', 'objects/') + item = item.replace('Objekte5\\', 'objects/') + item = item.replace('objekte5\\', 'objects/') + item = item.replace('Marken\\', 'marks/') + item = item.replace('Marken/', 'marks/') + item = item.replace('MArken\\', 'marks/') + item = item.replace('Goldschmiede/', 'goldsmiths/') + item = item.replace('Goldschmiede\\', 'goldsmiths/') + item = item.replace('Epitaphien/', 'epitaphies/') + item = item.replace('Epitaphien\\', 'epitaphies/') + imageValues.setdefault(item, {})['feb10344eaa7a5f414d1e8392853eba9'] = [item] # Reproduction Number (Image) + imageValues[item]['fc8d57e55f203c75c2f8a1ae79378ac7'] = ['public://artifact_images/' + item + '.jpg'] # File + imageValues[item]['f11beac4b638016479e6f3fbc7e55d1a'] = [str(uuid.uuid4())] # UUID + case 'f__9900_datum_erfassung': + digitisationProcessValues['f1f5dd22371e5c1de41e0fb099e0e862'] = value # Recording date + case 'f__99ae_datum_aenderung': + digitisationProcessValues['f8976c6a9e5d91fe9caba8a57c27f204'] = value # Change date + case 'f__efbm_bem_erfassung': + digitisationProcessValues['f78a6310d13c717b82ddba814ac59024'] = value # Recording note + case 'f__ptxt_plug_in_text': + markValues['ffb8b04e8d57929a596fc32d6a84d07d'] = value # Plugin text + case _: + print(f'{key} is not a valid field, skipping.') - # Create Dimension entities and add their UUIDs to a list - # because we link Mark and Dimension over the UUID - dimension = [] - for key, value in dimensionValues.items(): - if value: - dimensionItem = Entity(api=api, fields=value, bundle_id='b73258adf62f35bd1be3fa2863fab558') - api.save(dimensionItem) - dimension.append(value['f802fd7bf45be523a9b188411a591420'][0]) + # Create Dimension entities and add their UUIDs to a list + # because we link Mark and Dimension over the UUID + dimension = [] + for key, value in dimensionValues.items(): + if value: + dimensionItem = Entity(api=api, fields=value, bundle_id='b73258adf62f35bd1be3fa2863fab558') + api.save(dimensionItem) + dimension.append(value['f802fd7bf45be523a9b188411a591420'][0]) - # Create (feature) Dimension entities and add their UUIDs to a list - # because we link Feature and its Dimension over the UUID - featureDimension = [] - for key, value in featureDimensionValues.items(): - if value: - featureDimensionItem = Entity(api=api, fields=value, bundle_id='b73258adf62f35bd1be3fa2863fab558') # Dimension Bundle - api.save(featureDimensionItem) - featureDimension.append(value['f802fd7bf45be523a9b188411a591420'][0]) # Dimension UUID + # Create (feature) Dimension entities and add their UUIDs to a list + # because we link Feature and its Dimension over the UUID + featureDimension = [] + for key, value in featureDimensionValues.items(): + if value: + featureDimensionItem = Entity(api=api, fields=value, bundle_id='b73258adf62f35bd1be3fa2863fab558') # Dimension Bundle + api.save(featureDimensionItem) + featureDimension.append(value['f802fd7bf45be523a9b188411a591420'][0]) # Dimension UUID - # Add the serif feature t the feature list - if featureDimension: - featureValues.setdefault('serif', {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['serif'] # Feature Type - featureValues['serif']['f0f825f5d3a6f0e2d67eee311b94cd6f'] = featureDimension # Dimension UUIDs - featureValues['serif']['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID + # Add the serif feature t the feature list + if featureDimension: + featureValues.setdefault('serif', {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['serif'] # Feature Type + featureValues['serif']['f0f825f5d3a6f0e2d67eee311b94cd6f'] = featureDimension # Dimension UUIDs + featureValues['serif']['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID - # Create Dimension entities and add their UUIDs to a list - # because we link Mark and Dimension over the UUID - feature = [] - for key, value in featureValues.items(): - if value: - featureItem = Entity(api=api, fields=value, bundle_id='b393e1c3db202fbb7a8b54e65eb38227') # Feature Bundle - api.save(featureItem) - feature.append(value['f299e2a145b508e376f2bf2e44cbe219'][0]) # Feature UUID + # Create Dimension entities and add their UUIDs to a list + # because we link Mark and Dimension over the UUID + feature = [] + for key, value in featureValues.items(): + if value: + featureItem = Entity(api=api, fields=value, bundle_id='b393e1c3db202fbb7a8b54e65eb38227') # Feature Bundle + api.save(featureItem) + feature.append(value['f299e2a145b508e376f2bf2e44cbe219'][0]) # Feature UUID - # Create Image entities and add their UUIDs to a list - # because we link Image Assignment and Image over the UUID - imageList = [] - for key, value in imageValues.items(): - if value: - imageItem = Entity(api=api, fields=value, bundle_id='b8c6c4b478ead1c80e175ad0f98dafe3') - api.save(imageItem) - imageList.append(value['f11beac4b638016479e6f3fbc7e55d1a'][0]) + # Create Image entities and add their UUIDs to a list + # because we link Image Assignment and Image over the UUID + imageList = [] + for key, value in imageValues.items(): + if value: + imageItem = Entity(api=api, fields=value, bundle_id='b8c6c4b478ead1c80e175ad0f98dafe3') + api.save(imageItem) + imageList.append(value['f11beac4b638016479e6f3fbc7e55d1a'][0]) - # Create Image Assignment entities and add their UUIDs to a list - # because we link Artifact and Image Assignment over the UUID - if imageList: - imageAssignmentValues['f70afb79b45472fee3d02f011caa4b36'] = imageList # List of Image UUIDs - imageAssignment = Entity(api=api, fields=imageAssignmentValues, bundle_id='b88e5d94fb2a83d62df99cf64d6c010c') - api.save(imageAssignment) + # Create Image Assignment entities and add their UUIDs to a list + # because we link Artifact and Image Assignment over the UUID + if imageList: + imageAssignmentValues['f70afb79b45472fee3d02f011caa4b36'] = imageList # List of Image UUIDs + imageAssignment = Entity(api=api, fields=imageAssignmentValues, bundle_id='b88e5d94fb2a83d62df99cf64d6c010c') + api.save(imageAssignment) - # Create Digitisation Process - if digitisationProcessValues: - digitisationProcess = Entity(api=api, fields=digitisationProcessValues, bundle_id='b22e6c47ccb3ab8a974b37279e1bc33b') - api.save(digitisationProcess) + # Create Digitisation Process + if digitisationProcessValues: + digitisationProcess = Entity(api=api, fields=digitisationProcessValues, bundle_id='b22e6c47ccb3ab8a974b37279e1bc33b') + api.save(digitisationProcess) - # Add the field values for reference - if dimension: - markValues['f05807c9d81cd39b814f83de0175d66a'] = dimension # Dimension - if feature: - markValues['f3ce49288bc03e9d799f20ea277429db'] = feature # Feature - if imageAssignmentValues['f067784f5b1ff850672124a2b05360de'][0]: - markValues['f73e27498813a922032b18b3f3ab8d10'] = [imageAssignmentValues['f067784f5b1ff850672124a2b05360de'][0]] # Image Assignment - if digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]: - markValues['f3baf98f752fc9638de175985183119a'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process + # Add the field values for reference + if dimension: + markValues['f05807c9d81cd39b814f83de0175d66a'] = dimension # Dimension + if feature: + markValues['f3ce49288bc03e9d799f20ea277429db'] = feature # Feature + if imageAssignmentValues['f067784f5b1ff850672124a2b05360de'][0]: + markValues['f73e27498813a922032b18b3f3ab8d10'] = [imageAssignmentValues['f067784f5b1ff850672124a2b05360de'][0]] # Image Assignment + if digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]: + markValues['f3baf98f752fc9638de175985183119a'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process - # Create Mark - mark = Entity(api=api, fields=markValues, bundle_id='b2c4e1c984d7758d7c7ec719110f7125') - api.save(mark) + # Create Mark + mark = Entity(api=api, fields=markValues, bundle_id=bundleId) + api.save(mark) - print(f'Created mark number {index}: {mark.uri} of {len(markTable)}') + print(f'Created mark number {index}: {mark.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'id': row['id'], 'markId': markValues['fe577970c02f173170ff3848a36b3b79'][0], 'uuid': markValues['fb40b199b4032e55acc152f994e93b45'][0], 'uri': mark.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processedMarks.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'markId': markValues['fe577970c02f173170ff3848a36b3b79'][0], 'uuid': markValues['fb40b199b4032e55acc152f994e93b45'][0], 'uri': mark.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finished importing marks') diff --git a/04_importSourceReferenceAssignment.py b/04_importSourceReferenceAssignment.py index b6eecf0..a1e0c3b 100644 --- a/04_importSourceReferenceAssignment.py +++ b/04_importSourceReferenceAssignment.py @@ -5,82 +5,69 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importSourceReferenceAssignment(api, engine): + print('Importing source reference assignments...') -# Load the environment variables -load_dotenv() + tableName = "c__8130_que_kurzt_" + bundleId = 'b3c4232e84c2f39795bd602f152ed6f0' # Source reference assignment -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) + + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "c__8130_que_kurzt_" -bundleId = 'b3c4232e84c2f39795bd602f152ed6f0' # Source reference assignment -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=[ 'docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.iloc[index, 0] == processedRows.iloc[index, 0]: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.iloc[index, 0]}') - continue - # Create Entity property dicts - entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value continue - case 'f__uuid': - entityValues['fbe74fcb0ab0ce5a0181467b9b07e12e'] = value # UUID - fUuid = value[0] - case 'f__8134_stelle': - entityValues['f769795b4fd628d01692dd4516322db4'] = value # Source reference - case 'f__8130_que_kurzt_': - entityValues['f3e841bf3b4e91716d1ff5b83bf293d9'] = value # Source short title + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + continue + case 'f__uuid': + entityValues['fbe74fcb0ab0ce5a0181467b9b07e12e'] = value # UUID + fUuid = value[0] + case 'f__8134_stelle': + entityValues['f769795b4fd628d01692dd4516322db4'] = value # Source reference + case 'f__8130_que_kurzt_': + entityValues['f3e841bf3b4e91716d1ff5b83bf293d9'] = value # Source short title - case _: - print(f'{key} is not a valid field, skipping.') + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(tableName)}') + print(f'Created source reference assignment {index}: {entity.uri} of {len(tableName)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) -print('finish') + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + print('finished importing source reference assignments') diff --git a/05_importArtist.py b/05_importArtist.py index a88ab14..e9bfff3 100644 --- a/05_importArtist.py +++ b/05_importArtist.py @@ -5,165 +5,154 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtist(api, engine): + print('Importing artists...') -# Load the environment variables -load_dotenv() + tableName = 'c__kue' + bundleId = 'bc322be33491dacc600dd43fdee09a5c' -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + test = False -test = True + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri']) -try: - processedRows = pd.read_csv(f'./logs/processedArtists.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['artistId', 'uuid', 'uri']) + # Load sources table + artistsTable = pd.read_sql_table(tableName, con=engine) -# Load sources table -artistsTable = pd.read_sql_table('c__kue', con=engine) + # Create artists + for index, row in artistsTable.iterrows(): -artistValues = {} -digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]} -imageValues = {} -reproNumberAssignmentValues = {'fac4426c096e7f8f44bb0e11b8394952': [str(uuid.uuid4())]} - -# Create artists -for index, row in artistsTable.iterrows(): - # For every row in table... - if index < len(processedRows) and artistsTable.loc[index, 'f__3000_kue_dok_nr_'] == processedRows.loc[index, 'artistId']: - # skip if already processed - print(f'Skipping already processed artist {artistsTable.loc[index, "f__3000_kue_dok_nr_"]}') - continue - # Create Entity property dicts - artistValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # For every row in table... + if index < len(processedRows) and artistsTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed artist {artistsTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '###{{new_line}}###' in str(value): - print('replaced curly braces') - value = str(value).replace('###{{new_line}}###', '') - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - match key: - case 'id': + # Create Entity property dicts + artistValues = {} + digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]} + imageValues = {} + reproNumberAssignmentValues = {'fac4426c096e7f8f44bb0e11b8394952': [str(uuid.uuid4())]} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value continue - case 'f__uuid': - artistValues['fff2eb2283e4cd8df3783602a1bc96ab'] = value # UUID - case 'f__3170_and__taetigkeit': - artistValues['f01f51e385e5f206653e029ff5c845c4'] = value # Alternate occupation - case 'f__3000_kue_dok_nr_': - artistValues['f61deac361ac5e0731edbf214761d15c'] = value # Artist Document Number - case 'f__3002_pub_kue_nr_': - artistValues['f46b2ec14ce05d2618427c526198d64e'] = value # Artist published number - case 'f__9990_kommentar': - artistValues['fedc08e4225ac800e5d9f16bf345d181'] = value # Comment - case 'f__3360_letzte_erw_': - artistValues['f1419788b918f4c4a13393fd09ff37b3'] = value # Last Mentioned - case 'f__6700_mar_dok_nr_': - artistValues['f3d63eec34c00556cbadf635f78d815a'] = value # Mark Assignment - case 'f__33gs_meister_als': - artistValues['f30b60be791fb13f919c31510ca4de50'] = value # Master Education - case 'f__33mj_meisterjahr': - artistValues['fd2d07bb9ea1eadacdf28e41cacb92c1'] = value # Master Year - case 'f__3100_name': - artistValues['f71c047dad23083850a13d489386bf31'] = value # Name - case 'f__3105_abw_schreibw_': - artistValues['fbe84024bf9fad8f6a545b3af75d8b1b'] = value # Name Variants - case 'f__3166_fakt__taetig_als': - artistValues['fb0373e9fd949984cf9c09ec1ea0746c'] = value # Occupation - case 'f__336p_1__posth__erw_': - artistValues['fe079424bb6196d4a9721f84c43361f8'] = value # Posthumous Mentioned - case 'f__8540_repro_nr_': - # We map images to Image entity - for item in value: - if item is not None: - # Replace dir paths in name - item = item.replace('Objekte\\', 'objects/') - item = item.replace('Objekte3\\', 'objects/') - item = item.replace('Objekte4\\', 'objects/') - item = item.replace('Objekte5\\', 'objects/') - item = item.replace('objekte5\\', 'objects/') - item = item.replace('Marken\\', 'marks/') - item = item.replace('Marken/', 'marks/') - item = item.replace('MArken\\', 'marks/') - item = item.replace('Goldschmiede/', 'goldsmiths/') - item = item.replace('Goldschmiede\\', 'goldsmiths/') - item = item.replace('Epitaphien/', 'epitaphies/') - item = item.replace('Epitaphien\\', 'epitaphies/') - imageValues.setdefault(item, {})['feb10344eaa7a5f414d1e8392853eba9'] = [item] # Reproduction Number (Image) - imageValues[item]['fc8d57e55f203c75c2f8a1ae79378ac7'] = ['public://artist_images/' + item + '.jpg'] # File - imageValues[item]['f11beac4b638016479e6f3fbc7e55d1a'] = [str(uuid.uuid4())] # UUID - case 'f__6770_rosenb_nr_': - artistValues['f82ed1dc96df9230e28e04fef0ff2305'] = value # Rosenberg number - # Digitisation Process - case 'f__9900_datum_erfassung': - digitisationProcessValues['f1f5dd22371e5c1de41e0fb099e0e862'] = value # Recording date - case 'f__99ae_datum_aenderung': - digitisationProcessValues['f8976c6a9e5d91fe9caba8a57c27f204'] = value # Change date - case 'f__efbm_bem_erfassung': - digitisationProcessValues['f78a6310d13c717b82ddba814ac59024'] = value # Recording note - case _: - print(f'{key} is not a valid field, skipping.') + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + match key: + case 'id': + continue + case 'f__uuid': + artistValues['fff2eb2283e4cd8df3783602a1bc96ab'] = value # UUID + case 'f__3170_and__taetigkeit': + artistValues['f01f51e385e5f206653e029ff5c845c4'] = value # Alternate occupation + case 'f__3000_kue_dok_nr_': + artistValues['f61deac361ac5e0731edbf214761d15c'] = value # Artist Document Number + case 'f__3002_pub_kue_nr_': + artistValues['f46b2ec14ce05d2618427c526198d64e'] = value # Artist published number + case 'f__9990_kommentar': + artistValues['fedc08e4225ac800e5d9f16bf345d181'] = value # Comment + case 'f__3360_letzte_erw_': + artistValues['f1419788b918f4c4a13393fd09ff37b3'] = value # Last Mentioned + case 'f__6700_mar_dok_nr_': + artistValues['f3d63eec34c00556cbadf635f78d815a'] = value # Mark Assignment + case 'f__33gs_meister_als': + artistValues['f30b60be791fb13f919c31510ca4de50'] = value # Master Education + case 'f__33mj_meisterjahr': + artistValues['fd2d07bb9ea1eadacdf28e41cacb92c1'] = value # Master Year + case 'f__3100_name': + artistValues['f71c047dad23083850a13d489386bf31'] = value # Name + case 'f__3105_abw_schreibw_': + artistValues['fbe84024bf9fad8f6a545b3af75d8b1b'] = value # Name Variants + case 'f__3166_fakt__taetig_als': + artistValues['fb0373e9fd949984cf9c09ec1ea0746c'] = value # Occupation + case 'f__336p_1__posth__erw_': + artistValues['fe079424bb6196d4a9721f84c43361f8'] = value # Posthumous Mentioned + case 'f__8540_repro_nr_': + # We map images to Image entity + for item in value: + if item is not None: + # Replace dir paths in name + item = item.replace('Objekte\\', 'objects/') + item = item.replace('Objekte3\\', 'objects/') + item = item.replace('Objekte4\\', 'objects/') + item = item.replace('Objekte5\\', 'objects/') + item = item.replace('objekte5\\', 'objects/') + item = item.replace('Marken\\', 'marks/') + item = item.replace('Marken/', 'marks/') + item = item.replace('MArken\\', 'marks/') + item = item.replace('Goldschmiede/', 'goldsmiths/') + item = item.replace('Goldschmiede\\', 'goldsmiths/') + item = item.replace('Epitaphien/', 'epitaphies/') + item = item.replace('Epitaphien\\', 'epitaphies/') + imageValues.setdefault(item, {})['feb10344eaa7a5f414d1e8392853eba9'] = [item] # Reproduction Number (Image) + imageValues[item]['fc8d57e55f203c75c2f8a1ae79378ac7'] = ['public://artist_images/' + item + '.jpg'] # File + imageValues[item]['f11beac4b638016479e6f3fbc7e55d1a'] = [str(uuid.uuid4())] # UUID + case 'f__6770_rosenb_nr_': + artistValues['f82ed1dc96df9230e28e04fef0ff2305'] = value # Rosenberg number + # Digitisation Process + case 'f__9900_datum_erfassung': + digitisationProcessValues['f1f5dd22371e5c1de41e0fb099e0e862'] = value # Recording date + case 'f__99ae_datum_aenderung': + digitisationProcessValues['f8976c6a9e5d91fe9caba8a57c27f204'] = value # Change date + case 'f__efbm_bem_erfassung': + digitisationProcessValues['f78a6310d13c717b82ddba814ac59024'] = value # Recording note + case _: + print(f'{key} is not a valid field, skipping.') - # Create Digitisation Process - digitisationProcess = Entity(api=api, fields=digitisationProcessValues, bundle_id='b22e6c47ccb3ab8a974b37279e1bc33b') - api.save(digitisationProcess) + # Create Digitisation Process + digitisationProcess = Entity(api=api, fields=digitisationProcessValues, bundle_id='b22e6c47ccb3ab8a974b37279e1bc33b') + api.save(digitisationProcess) - # Create Image entities and add their UUIDs to a list - # because we link Image Assignment and Image over the UUID - imageList = [] - for key, value in imageValues.items(): - if value: - imageItem = Entity(api=api, fields=value, bundle_id='b8c6c4b478ead1c80e175ad0f98dafe3') - api.save(imageItem) - imageList.append(value['f11beac4b638016479e6f3fbc7e55d1a'][0]) + # Create Image entities and add their UUIDs to a list + # because we link Image Assignment and Image over the UUID + imageList = [] + for key, value in imageValues.items(): + if value: + imageItem = Entity(api=api, fields=value, bundle_id='b8c6c4b478ead1c80e175ad0f98dafe3') + api.save(imageItem) + imageList.append(value['f11beac4b638016479e6f3fbc7e55d1a'][0]) # add UUID to list - # Create Image Assignment entities and add their UUIDs to a list - # because we link Artist and Image Assignment over the UUID - if imageList: - reproNumberAssignmentValues['f2cd4ece6e60bf288b9ae769af08bc44'] = imageList # List of Image UUIDs - reproNumberAssignment = Entity(api=api, fields=reproNumberAssignmentValues, bundle_id='bdc233b242374a41b5e6923eee937fe9') - api.save(reproNumberAssignment) + # Create Image Assignment entities and add their UUIDs to a list + # because we link Artist and Image Assignment over the UUID + if imageList: + reproNumberAssignmentValues['f2cd4ece6e60bf288b9ae769af08bc44'] = imageList # List of Image UUIDs + reproNumberAssignment = Entity(api=api, fields=reproNumberAssignmentValues, bundle_id='bdc233b242374a41b5e6923eee937fe9') + api.save(reproNumberAssignment) + else: + reproNumberAssignmentValues['f2cd4ece6e60bf288b9ae769af08bc44'] = [] - if reproNumberAssignmentValues['f2cd4ece6e60bf288b9ae769af08bc44'][0]: - artistValues['f42deb039d8d4f47877892af005a1ef9'] = [reproNumberAssignmentValues['fac4426c096e7f8f44bb0e11b8394952'][0]] # Image Assignment - if digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]: - artistValues['f6c2b79f1ba142bb62f83b2c4d805e49'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process + if reproNumberAssignmentValues['f2cd4ece6e60bf288b9ae769af08bc44']: + artistValues['f42deb039d8d4f47877892af005a1ef9'] = [reproNumberAssignmentValues['fac4426c096e7f8f44bb0e11b8394952'][0]] # Image Assignment + if digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]: + artistValues['f6c2b79f1ba142bb62f83b2c4d805e49'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process - # Create Material - artist = Entity(api=api, fields=artistValues, bundle_id='bc322be33491dacc600dd43fdee09a5c') - api.save(artist) + # Create Material + artist = Entity(api=api, fields=artistValues, bundle_id=bundleId) + api.save(artist) - print(f'Created artist {index}: {artist.uri} of {len(artistsTable)}') + print(f'Created artist {index}: {artist.uri} of {len(artistsTable)}') - # Write log - processedRows = processedRows._append({'artistId': artistValues['f61deac361ac5e0731edbf214761d15c'][0], 'uuid': artistValues['fff2eb2283e4cd8df3783602a1bc96ab'][0], 'uri': artist.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processedArtists.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': artistValues['fff2eb2283e4cd8df3783602a1bc96ab'][0], 'uri': artist.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) - if test: - print('Testing mode activated. Exiting.') - exit() + if test: + print('Testing mode activated. Exiting.') + exit() -print('finish') + print('finished importing artists') diff --git a/06_importLiterature.py b/06_importLiterature.py index c7e85a9..8d96614 100644 --- a/06_importLiterature.py +++ b/06_importLiterature.py @@ -5,122 +5,108 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importLiterature(api, engine): + print('Importing literature...') + tableName = 'c__lit' + bundleId = 'bafe9c3d3b640d4d1a16b104f367ac91' -# Load the environment variables -load_dotenv() + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=['id', 'docId', 'uuid', 'uri']) -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + # Load sources table + literaturesTable = pd.read_sql_table(tableName, con=engine) -try: - processedRows = pd.read_csv(f'./logs/processedLiteratures.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['id', 'literatureId', 'uuid', 'uri']) -# Load sources table -literaturesTable = pd.read_sql_table('c__lit', con=engine) - -literatureValues = {} -digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]} - -# Create literatures -for index, row in literaturesTable.iterrows(): - # For every row in table... - if index < len(processedRows) and literaturesTable.iloc[index, 0] == processedRows.iloc[index, 0]: - # skip if already processed - print(f'Skipping already processed literature {literaturesTable.iloc[index, 0]}') - continue - # Create Entity property dicts - literatureValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create literatures + for index, row in literaturesTable.iterrows(): + # For every row in table... + if index < len(processedRows) and literaturesTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed literature {literaturesTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '###{{new_line}}###' in str(value): - print('replaced curly braces') - value = str(value).replace('###{{new_line}}###', '') - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - match key: - case 'id': + # Create Entity property dicts + literatureValues = {} + digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value continue - case 'f__uuid': - literatureValues['fd58e0884f7cf63f8436c2789fcd2745'] = value # UUID - case 'f__9990_kommentar': - literatureValues['f3208633f7767cc9f5e44e768818df20'] = value # Comment - case 'f__8270_verfasser': - literatureValues['f60a88060c75068b4bf2eefd5221793f'] = value # Creator - case 'f__8324_ersch_jahr': - literatureValues['fdae7bd743ae58bf623feca3a26bcf6c'] = value # Date - case 'f__8280_hrsg': - literatureValues['fd0bc706876adee304892f8f9e34567f'] = value # Editor - case 'f__8346_signatur': - literatureValues['fb434c214be21f7e82a851d6524c2850'] = value # Identifier - case 'f__9970_schlagwort': - literatureValues['f1a55055944adf5d4e866a1768633a7f'] = value # Keyword - case 'f__8200_lit_dok_nr_': - literatureValues['f3bdd54b9ea5808a571200e9c60e103e'] = value # Literature Document Identifier - case 'f__9971_sw_goldschmied': - literatureValues['f21a286fec5d48ea238c10877ee2b0db'] = value # Mentioned Actor - case 'f__8308_bibl_zusatz': - literatureValues['f1674a743a13a3d74b0c6ebb2cf0043f'] = value # Note - case 'f__8319_seitenangabe': - literatureValues['f0d1716a40498f52abd4a6522aa5f3ef'] = value # Pages - case 'f__8320_ersch_ort': - literatureValues['fc3cafc0f542cef2a0e1189873ff58a3'] = value # Publication Place - case 'f__8300_serientitel': - literatureValues['f660f34eb7091c1b0f4b492e49a0e71b'] = value # Series Title - case 'f__8330_lit_kurzt_': - literatureValues['f84416d4380cdd30e8b9fcea57f58957'] = value # Shorttitle - case 'f__8307_titelzusatz': - literatureValues['f8521679ac8f6441ddb086f1c5ed7528'] = value # Subtitle - case 'f__8290_titel': - literatureValues['fa1ae40cc9940569d5a1e3ea13e33488'] = value # Title - case 'f__8260_art': - literatureValues['f92c6453d265a952a56252e7d93cedea'] = value # Type - # Digitisation Process - case 'f__9900_datum_erfassung': - digitisationProcessValues['f1f5dd22371e5c1de41e0fb099e0e862'] = value # Recording date - case 'f__99ae_datum_aenderung': - digitisationProcessValues['f8976c6a9e5d91fe9caba8a57c27f204'] = value # Change date - case 'f__efbm_bem_erfassung': - digitisationProcessValues['f78a6310d13c717b82ddba814ac59024'] = value # Recording note - case _: - print(f'{key} is not a valid field, skipping.') + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + match key: + case 'id': + continue + case 'f__uuid': + literatureValues['fd58e0884f7cf63f8436c2789fcd2745'] = value # UUID + case 'f__9990_kommentar': + literatureValues['f3208633f7767cc9f5e44e768818df20'] = value # Comment + case 'f__8270_verfasser': + literatureValues['f60a88060c75068b4bf2eefd5221793f'] = value # Creator + case 'f__8324_ersch_jahr': + literatureValues['fdae7bd743ae58bf623feca3a26bcf6c'] = value # Date + case 'f__8280_hrsg': + literatureValues['fd0bc706876adee304892f8f9e34567f'] = value # Editor + case 'f__8346_signatur': + literatureValues['fb434c214be21f7e82a851d6524c2850'] = value # Identifier + case 'f__9970_schlagwort': + literatureValues['f1a55055944adf5d4e866a1768633a7f'] = value # Keyword + case 'f__8200_lit_dok_nr_': + literatureValues['f3bdd54b9ea5808a571200e9c60e103e'] = value # Literature Document Identifier + case 'f__9971_sw_goldschmied': + literatureValues['f21a286fec5d48ea238c10877ee2b0db'] = value # Mentioned Actor + case 'f__8308_bibl_zusatz': + literatureValues['f1674a743a13a3d74b0c6ebb2cf0043f'] = value # Note + case 'f__8319_seitenangabe': + literatureValues['f0d1716a40498f52abd4a6522aa5f3ef'] = value # Pages + case 'f__8320_ersch_ort': + literatureValues['fc3cafc0f542cef2a0e1189873ff58a3'] = value # Publication Place + case 'f__8300_serientitel': + literatureValues['f660f34eb7091c1b0f4b492e49a0e71b'] = value # Series Title + case 'f__8330_lit_kurzt_': + literatureValues['f84416d4380cdd30e8b9fcea57f58957'] = value # Shorttitle + case 'f__8307_titelzusatz': + literatureValues['f8521679ac8f6441ddb086f1c5ed7528'] = value # Subtitle + case 'f__8290_titel': + literatureValues['fa1ae40cc9940569d5a1e3ea13e33488'] = value # Title + case 'f__8260_art': + literatureValues['f92c6453d265a952a56252e7d93cedea'] = value # Type + # Digitisation Process + case 'f__9900_datum_erfassung': + digitisationProcessValues['f1f5dd22371e5c1de41e0fb099e0e862'] = value # Recording date + case 'f__99ae_datum_aenderung': + digitisationProcessValues['f8976c6a9e5d91fe9caba8a57c27f204'] = value # Change date + case 'f__efbm_bem_erfassung': + digitisationProcessValues['f78a6310d13c717b82ddba814ac59024'] = value # Recording note + case _: + print(f'{key} is not a valid field, skipping.') - # Create Digitisation Process - digitisationProcess = Entity(api=api, fields=digitisationProcessValues, bundle_id='b22e6c47ccb3ab8a974b37279e1bc33b') - api.save(digitisationProcess) + # Create Digitisation Process + digitisationProcess = Entity(api=api, fields=digitisationProcessValues, bundle_id='b22e6c47ccb3ab8a974b37279e1bc33b') + api.save(digitisationProcess) - # Set Digitisation Process - literatureValues['f59a2ad5cce3e51f172215ea88afac41'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process + # Set Digitisation Process + literatureValues['f59a2ad5cce3e51f172215ea88afac41'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process - # Create Material - literature = Entity(api=api, fields=literatureValues, bundle_id='bafe9c3d3b640d4d1a16b104f367ac91') - api.save(literature) + # Create Material + literature = Entity(api=api, fields=literatureValues, bundle_id='bafe9c3d3b640d4d1a16b104f367ac91') + api.save(literature) - print(f'Created literature {index}: {literature.uri} of {len(literaturesTable)}') + print(f'Created literature {index}: {literature.uri} of {len(literaturesTable)}') - # Write log - processedRows = processedRows._append({'id': row['id'], 'literatureId': literatureValues['f3bdd54b9ea5808a571200e9c60e103e'][0], 'uuid': literatureValues['fd58e0884f7cf63f8436c2789fcd2745'][0], 'uri': literature.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processedLiteratures.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'docId': literatureValues['f3bdd54b9ea5808a571200e9c60e103e'][0], 'uuid': literatureValues['fd58e0884f7cf63f8436c2789fcd2745'][0], 'uri': literature.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finish') diff --git a/07_importInspectionMark.py b/07_importInspectionMark.py index 100eb82..83c86c2 100644 --- a/07_importInspectionMark.py +++ b/07_importInspectionMark.py @@ -5,197 +5,182 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importInspectionMark(api, engine): + print('Importing inspection marks...') + tableName = 'c__bez' + bundleId = 'baad021dfda9b89d5ba407dd0fca0d03' -# Load the environment variables -load_dotenv() + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=['id', 'docId', 'uuid', 'uri']) -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + # Load inspectionMark table + inspectionMarkTable = pd.read_sql_table('c__bez', con=engine) -# Simple log - -try: - processedRows = pd.read_csv(f'./logs/processedInspectionMarks.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['id', 'inspectionMarkId', 'uuid', 'uri']) - -# Load inspectionMark table -inspectionMarkTable = pd.read_sql_table('c__bez', con=engine) - -# Create inspectionMark -for index, row in inspectionMarkTable.iterrows(): - # For every row in table... - if index < len(processedRows) and inspectionMarkTable.loc[index, 'id'] == processedRows.loc[index, 'id']: - # skip if already processed - print(f'Skipping already processed inspectionMark {inspectionMarkTable.loc[index, "id"]}') - continue - # Create Entity property dicts - inspectionMarkValues = {} - creationValues = {} - digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]} - dimensionValues = {} - featureValues = {} - featureDimensionValues = {} - imageValues = {} - imageAssignmentValues = {'f067784f5b1ff850672124a2b05360de': [str(uuid.uuid4())]} - - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create inspectionMark + for index, row in inspectionMarkTable.iterrows(): + # For every row in table... + if index < len(processedRows) and inspectionMarkTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed inspectionMark {inspectionMarkTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '###{' in str(value): - print('replaced curly braces') - value = str(value).replace('###{new_line', '') - value = str(value).replace('}###', '') - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification - # for nested semantics, because we need to be efficient. - match key: - case 'id': + # Create Entity property dicts + inspectionMarkValues = {} + creationValues = {} + digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]} + dimensionValues = {} + featureValues = {} + featureDimensionValues = {} + imageValues = {} + imageAssignmentValues = {'f067784f5b1ff850672124a2b05360de': [str(uuid.uuid4())]} + + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value continue - case 'f__uuid': - inspectionMarkValues['fb125fa322fe7c3c98446e382b1f22b9'] = value # UUID - case 'f__9990_kommentar': - inspectionMarkValues['f31eb01562daaeaa27d6c02012fccf02'] = value # Comment - case 'f__67bn_bz_kat_nr': - inspectionMarkValues['f275b0537ab47b15c24f31ad8a8aa226'] = value # Inspection Mark Cataloque Identifer - case 'f__67b0_bz_dok_nr': - inspectionMarkValues['f1cfc4053651e47d629bd5fc9fd707c1'] = value # Inspection Mark Identifier - case 'f__6700_mar_dok_nr_': - inspectionMarkValues['fcdb19d95832ac030d353b5ba92796b7'] = value # Mark Document Identifier - case 'f__8470_aufnahmenr_': - inspectionMarkValues['f58febbb759a07a75edf9978771c1013'] = value # Recording Number - case 'f__68an_abdruck_nr_': - inspectionMarkValues['f7c155684a82af5caa3191f2646b51da'] = value # Reproduction Number - case 'f__68nk_besonderheiten': - inspectionMarkValues['fd980fca65d9ffd2f95859c4c5b9d284'] = value # Special Feature - case 'f__68ne_haste_schraegstr_': - # We map features to Feature entity. - featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['transition_haste_slash'] # Type - featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature - featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID - case 'f__68nf_n_knick': - # We map features to Feature entity. - featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['transition_haste_slash_kink'] # Type - featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature - featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID - case 'f__68ng_ueberg__serifen': - # We map features to Feature entity. - featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = [ - 'transition_serif_haste'] # Type - featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature - featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID - case 'f__68nh_dicke_ser__max_': - # We map (features) dimensions to Dimension entity. - featureDimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['maximum_thickness'] # Type - featureDimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension - featureDimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID - case 'f__8540_repro_nr_': - # We map images to Image entity - for item in value: - if item is not None: - # Replace dir paths in name - item = item.replace('Objekte\\', 'objects/') - item = item.replace('Objekte3\\', 'objects/') - item = item.replace('Objekte4\\', 'objects/') - item = item.replace('Objekte5\\', 'objects/') - item = item.replace('objekte5\\', 'objects/') - item = item.replace('Marken\\', 'marks/') - item = item.replace('MArken\\', 'marks/') - item = item.replace('Marken/', 'marks/') - imageValues.setdefault(item, {})['feb10344eaa7a5f414d1e8392853eba9'] = [item] # Reproduction Number (Image) - imageValues[item]['fc8d57e55f203c75c2f8a1ae79378ac7'] = ['public://artifact_images/' + item + '.jpg'] # File - imageValues[item]['f11beac4b638016479e6f3fbc7e55d1a'] = [str(uuid.uuid4())] # UUID - case 'f__9900_datum_erfassung': - digitisationProcessValues['f1f5dd22371e5c1de41e0fb099e0e862'] = value # Recording date - case 'f__99ae_datum_aenderung': - digitisationProcessValues['f8976c6a9e5d91fe9caba8a57c27f204'] = value # Change date - case 'f__efbm_bem_erfassung': - digitisationProcessValues['f78a6310d13c717b82ddba814ac59024'] = value # Recording note - case 'f__ptxt_plug_in_text': - inspectionMarkValues['ffb8b04e8d57929a596fc32d6a84d07d'] = value # Plugin text - case _: - print(f'{key} is not a valid field, skipping.') + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification + # for nested semantics, because we need to be efficient. + match key: + case 'id': + continue + case 'f__uuid': + inspectionMarkValues['fb125fa322fe7c3c98446e382b1f22b9'] = value # UUID + case 'f__9990_kommentar': + inspectionMarkValues['f31eb01562daaeaa27d6c02012fccf02'] = value # Comment + case 'f__67bn_bz_kat_nr': + inspectionMarkValues['f275b0537ab47b15c24f31ad8a8aa226'] = value # Inspection Mark Cataloque Identifer + case 'f__67b0_bz_dok_nr': + inspectionMarkValues['f1cfc4053651e47d629bd5fc9fd707c1'] = value # Inspection Mark Identifier + case 'f__6700_mar_dok_nr_': + inspectionMarkValues['fcdb19d95832ac030d353b5ba92796b7'] = value # Mark Document Identifier + case 'f__8470_aufnahmenr_': + inspectionMarkValues['f58febbb759a07a75edf9978771c1013'] = value # Recording Number + case 'f__68an_abdruck_nr_': + inspectionMarkValues['f7c155684a82af5caa3191f2646b51da'] = value # Reproduction Number + case 'f__68nk_besonderheiten': + inspectionMarkValues['fd980fca65d9ffd2f95859c4c5b9d284'] = value # Special Feature + case 'f__68ne_haste_schraegstr_': + # We map features to Feature entity. + featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['transition_haste_slash'] # Type + featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature + featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID + case 'f__68nf_n_knick': + # We map features to Feature entity. + featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['transition_haste_slash_kink'] # Type + featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature + featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID + case 'f__68ng_ueberg__serifen': + # We map features to Feature entity. + featureValues.setdefault(key, {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = [ + 'transition_serif_haste'] # Type + featureValues[key]['fbccee184fa531d58b3b46eb8ac4626f'] = value # Feature + featureValues[key]['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID + case 'f__68nh_dicke_ser__max_': + # We map (features) dimensions to Dimension entity. + featureDimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['maximum_thickness'] # Type + featureDimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension + featureDimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID + case 'f__8540_repro_nr_': + # We map images to Image entity + for item in value: + if item is not None: + # Replace dir paths in name + item = item.replace('Objekte\\', 'objects/') + item = item.replace('Objekte/', 'objects/') + item = item.replace('Objekte3\\', 'objects/') + item = item.replace('Objekte4\\', 'objects/') + item = item.replace('Objekte5\\', 'objects/') + item = item.replace('objekte5\\', 'objects/') + item = item.replace('Marken\\', 'marks/') + item = item.replace('MArken\\', 'marks/') + item = item.replace('Marken/', 'marks/') + imageValues.setdefault(item, {})['feb10344eaa7a5f414d1e8392853eba9'] = [item] # Reproduction Number (Image) + imageValues[item]['fc8d57e55f203c75c2f8a1ae79378ac7'] = ['public://artifact_images/' + item + '.jpg'] # File + imageValues[item]['f11beac4b638016479e6f3fbc7e55d1a'] = [str(uuid.uuid4())] # UUID + case 'f__9900_datum_erfassung': + digitisationProcessValues['f1f5dd22371e5c1de41e0fb099e0e862'] = value # Recording date + case 'f__99ae_datum_aenderung': + digitisationProcessValues['f8976c6a9e5d91fe9caba8a57c27f204'] = value # Change date + case 'f__efbm_bem_erfassung': + digitisationProcessValues['f78a6310d13c717b82ddba814ac59024'] = value # Recording note + case 'f__ptxt_plug_in_text': + inspectionMarkValues['ffb8b04e8d57929a596fc32d6a84d07d'] = value # Plugin text + case _: + print(f'{key} is not a valid field, skipping.') - # Create (feature) Dimension entities and add their UUIDs to a list - # because we link Feature and its Dimension over the UUID - featureDimension = [] - for key, value in featureDimensionValues.items(): - if value: - featureDimensionItem = Entity(api=api, fields=value, bundle_id='b73258adf62f35bd1be3fa2863fab558') # Dimension Bundle - api.save(featureDimensionItem) - featureDimension.append(value['f802fd7bf45be523a9b188411a591420'][0]) # Dimension UUID + # Create (feature) Dimension entities and add their UUIDs to a list + # because we link Feature and its Dimension over the UUID + featureDimension = [] + for key, value in featureDimensionValues.items(): + if value: + featureDimensionItem = Entity(api=api, fields=value, bundle_id='b73258adf62f35bd1be3fa2863fab558') # Dimension Bundle + api.save(featureDimensionItem) + featureDimension.append(value['f802fd7bf45be523a9b188411a591420'][0]) # Dimension UUID - # Add the serif feature t the feature list - if featureDimension: - featureValues.setdefault('serif', {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['serif'] # Feature Type - featureValues['serif']['f0f825f5d3a6f0e2d67eee311b94cd6f'] = featureDimension # Dimension UUIDs - featureValues['serif']['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID + # Add the serif feature t the feature list + if featureDimension: + featureValues.setdefault('serif', {})['fdfb3c4f670aa1260924cecd09ca4bbb'] = ['serif'] # Feature Type + featureValues['serif']['f0f825f5d3a6f0e2d67eee311b94cd6f'] = featureDimension # Dimension UUIDs + featureValues['serif']['f299e2a145b508e376f2bf2e44cbe219'] = [str(uuid.uuid4())] # UUID - # Create Dimension entities and add their UUIDs to a list - # because we link Mark and Dimension over the UUID - feature = [] - for key, value in featureValues.items(): - if value: - featureItem = Entity(api=api, fields=value, bundle_id='b393e1c3db202fbb7a8b54e65eb38227') # Feature Bundle - api.save(featureItem) - feature.append(value['f299e2a145b508e376f2bf2e44cbe219'][0]) # Feature UUID + # Create Dimension entities and add their UUIDs to a list + # because we link Mark and Dimension over the UUID + feature = [] + for key, value in featureValues.items(): + if value: + featureItem = Entity(api=api, fields=value, bundle_id='b393e1c3db202fbb7a8b54e65eb38227') # Feature Bundle + api.save(featureItem) + feature.append(value['f299e2a145b508e376f2bf2e44cbe219'][0]) # Feature UUID - # Create Image entities and add their UUIDs to a list - # because we link Image Assignment and Image over the UUID - imageList = [] - for key, value in imageValues.items(): - if value: - imageItem = Entity(api=api, fields=value, bundle_id='b8c6c4b478ead1c80e175ad0f98dafe3') - api.save(imageItem) - imageList.append(value['f11beac4b638016479e6f3fbc7e55d1a'][0]) + # Create Image entities and add their UUIDs to a list + # because we link Image Assignment and Image over the UUID + imageList = [] + for key, value in imageValues.items(): + if value: + imageItem = Entity(api=api, fields=value, bundle_id='b8c6c4b478ead1c80e175ad0f98dafe3') + api.save(imageItem) + imageList.append(value['f11beac4b638016479e6f3fbc7e55d1a'][0]) - # Create Image Assignment entities and add their UUIDs to a list - # because we link Artifact and Image Assignment over the UUID - if imageList: - imageAssignmentValues['f70afb79b45472fee3d02f011caa4b36'] = imageList # List of Image UUIDs - imageAssignment = Entity(api=api, fields=imageAssignmentValues, bundle_id='b88e5d94fb2a83d62df99cf64d6c010c') - api.save(imageAssignment) + # Create Image Assignment entities and add their UUIDs to a list + # because we link Artifact and Image Assignment over the UUID + if imageList: + imageAssignmentValues['f70afb79b45472fee3d02f011caa4b36'] = imageList # List of Image UUIDs + imageAssignment = Entity(api=api, fields=imageAssignmentValues, bundle_id='b88e5d94fb2a83d62df99cf64d6c010c') + api.save(imageAssignment) - # Create Digitisation Process - if digitisationProcessValues: - digitisationProcess = Entity(api=api, fields=digitisationProcessValues, bundle_id='b22e6c47ccb3ab8a974b37279e1bc33b') - api.save(digitisationProcess) + # Create Digitisation Process + if digitisationProcessValues: + digitisationProcess = Entity(api=api, fields=digitisationProcessValues, bundle_id='b22e6c47ccb3ab8a974b37279e1bc33b') + api.save(digitisationProcess) - # Add the field values for reference - if feature: - inspectionMarkValues['f7eba97158ff1b9afc5fa0a5823145b4'] = feature # Feature UUID - if imageAssignmentValues['f067784f5b1ff850672124a2b05360de'][0]: - inspectionMarkValues['fc697a5ad97f3277f20f67e18085b544'] = [imageAssignmentValues['f067784f5b1ff850672124a2b05360de'][0]] # Image Assignment - if digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]: - inspectionMarkValues['f998036ccd7daaf2d9938934c93938f3'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process + # Add the field values for reference + if feature: + inspectionMarkValues['f7eba97158ff1b9afc5fa0a5823145b4'] = feature # Feature UUID + if imageAssignmentValues['f067784f5b1ff850672124a2b05360de'][0]: + inspectionMarkValues['fc697a5ad97f3277f20f67e18085b544'] = [imageAssignmentValues['f067784f5b1ff850672124a2b05360de'][0]] # Image Assignment + if digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]: + inspectionMarkValues['f998036ccd7daaf2d9938934c93938f3'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process - # Create Mark - inspectionMark = Entity(api=api, fields=inspectionMarkValues, bundle_id='baad021dfda9b89d5ba407dd0fca0d03') - api.save(inspectionMark) + # Create Mark + inspectionMark = Entity(api=api, fields=inspectionMarkValues, bundle_id=bundleId) + api.save(inspectionMark) - print(f'Created inspectionMark number {index}: {inspectionMark.uri} of {len(inspectionMarkTable)}') + print(f'Created inspectionMark number {index}: {inspectionMark.uri} of {len(inspectionMarkTable)}') - # Write log - processedRows = processedRows._append({'id': row['id'], 'inspectionMarkId': inspectionMarkValues['fcdb19d95832ac030d353b5ba92796b7'][0], 'uuid': inspectionMarkValues['fb125fa322fe7c3c98446e382b1f22b9'][0], 'uri': inspectionMark.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processedInspectionMarks.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'docId': inspectionMarkValues['fcdb19d95832ac030d353b5ba92796b7'][0], 'uuid': inspectionMarkValues['fb125fa322fe7c3c98446e382b1f22b9'][0], 'uri': inspectionMark.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finished importing inspection marks') diff --git a/07_importJournalAssignment.py b/07_importJournalAssignment.py index 43f5d25..94a8ab8 100644 --- a/07_importJournalAssignment.py +++ b/07_importJournalAssignment.py @@ -5,84 +5,70 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importJournalAssignment(api, engine): -# Load the environment variables -load_dotenv() -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + test = False + tableName = "c__8310_zeitschrift" + bundleId = 'b5508ef3bb28f139ebdd9f6d545825c4' -test = False -tableName = "c__8310_zeitschrift" -bundleId = 'b5508ef3bb28f139ebdd9f6d545825c4' + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri']) -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts - entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__uuid': - entityValues['fadaaac928ec555c2574b3a9a4f5543d'] = value # UUID - fUuid = value[0] - case 'f__8310_zeitschrift': - entityValues['fd8fc741f6d4142637c061900b1cdd01'] = value # Client - case 'f__8312_zusatzzschr': - entityValues['f51edfb30c99d28bee1cf32b81190254'] = value # Date - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__uuid': + entityValues['fadaaac928ec555c2574b3a9a4f5543d'] = value # UUID + fUuid = value[0] + case 'f__8310_zeitschrift': + entityValues['fd8fc741f6d4142637c061900b1cdd01'] = value # Client + case 'f__8312_zusatzzschr': + entityValues['f51edfb30c99d28bee1cf32b81190254'] = value # Date + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created journal assignment {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() + # Write log + processedRows = processedRows._append({'id': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() -print('finish') + print('finish') diff --git a/07_importLiteratureReferenceAssignment.py b/07_importLiteratureReferenceAssignment.py index d859333..42504ad 100644 --- a/07_importLiteratureReferenceAssignment.py +++ b/07_importLiteratureReferenceAssignment.py @@ -5,84 +5,69 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importLiteratureReferenceAssignment(api, engine): + print('Importing literature reference assignments...') -# Load the environment variables -load_dotenv() + tableName = "c__8330_lit_kurzt_" + bundleId = 'bdda154adecb26deed2d8b67dab8a0db' # Literature Reference Assignment -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "c__8330_lit_kurzt_" -bundleId = 'bdda154adecb26deed2d8b67dab8a0db' # Literature Reference Assignment - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts - entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__uuid': - entityValues['facb3fc9d13472b00f59d506acece535'] = value # UUID - fUuid = value[0] - case 'f__8334_stelle': - entityValues['f099466b679af216600fdbfa722ddcb7'] = value # Literature Reference - case 'f__833r_repro_datei': - entityValues['fe145f4fec0a71a954bc3c75cf7b370a'] = value # Repro File - case 'f__8330_lit_kurzt_': - entityValues['ff2d656706c2ff11089f196ccab51843'] = value # Short Title - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__uuid': + entityValues['facb3fc9d13472b00f59d506acece535'] = value # UUID + fUuid = value[0] + case 'f__8334_stelle': + entityValues['f099466b679af216600fdbfa722ddcb7'] = value # Literature Reference + case 'f__833r_repro_datei': + entityValues['fe145f4fec0a71a954bc3c75cf7b370a'] = value # Repro File + case 'f__8330_lit_kurzt_': + entityValues['ff2d656706c2ff11089f196ccab51843'] = value # Short Title + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created literature reference assignment {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finish') diff --git a/07_importParentLiteratureAssignment.py b/07_importParentLiteratureAssignment.py index 24174b5..3362a09 100644 --- a/07_importParentLiteratureAssignment.py +++ b/07_importParentLiteratureAssignment.py @@ -5,84 +5,69 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importParentLiteratureAssignment(api, engine): -# Load the environment variables -load_dotenv() + test = False + tableName = "c__8292_uebergeordn_publ_" + bundleId = 'bf55dda81ca0ddb4237a0d3ea495579b' # Parent literature assignment -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -test = False -tableName = "c__8292_uebergeordn_publ_" -bundleId = 'bf55dda81ca0ddb4237a0d3ea495579b' # Parent literature assignment + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts - entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__uuid': - entityValues['f8cced7d1c2f8d0d3fa9aa36b7e123bd'] = value # UUID - fUuid = value[0] - case 'f__8292_uebergeordn_publ_': - entityValues['f97ea22d9dd853c8f1cced6bc85c59b2'] = value # Parent literature - case 'f__8294_zusatzsatit': - entityValues['faf62c71a8e5844241899c0aa7801a9c'] = value # Subtitle - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__uuid': + entityValues['f8cced7d1c2f8d0d3fa9aa36b7e123bd'] = value # UUID + fUuid = value[0] + case 'f__8292_uebergeordn_publ_': + entityValues['f97ea22d9dd853c8f1cced6bc85c59b2'] = value # Parent literature + case 'f__8294_zusatzsatit': + entityValues['faf62c71a8e5844241899c0aa7801a9c'] = value # Subtitle + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created parent literature assignment {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() -print('finish') + print('finish') diff --git a/08_importInspectionMarkLocation.py b/08_importInspectionMarkLocation.py index 04090f5..3c3a1da 100644 --- a/08_importInspectionMarkLocation.py +++ b/08_importInspectionMarkLocation.py @@ -5,81 +5,66 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importInspectionMarkLocation(api, engine): + print('Importing inspection mark locations...') -# Load the environment variables -load_dotenv() + tableName = 'c__67b0_bz_dok_nr' + bundleId = 'b4158ec3a326d8ab504062296a82f13a' -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri']) -try: - processedRows = pd.read_csv(f'./logs/processedInspectionMarkLocation.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri']) + # Load sources table + inspectionMarkLocationsTable = pd.read_sql_table(tableName, con=engine) -# Load sources table -inspectionMarkLocationsTable = pd.read_sql_table('c__67b0_bz_dok_nr', con=engine) - -inspectionMarkLocationValues = {} - -# Create inspectionMarkLocations -for index, row in inspectionMarkLocationsTable.iterrows(): - # For every row in table... - if index < len(processedRows) and inspectionMarkLocationsTable.iloc[index, 0] == processedRows.iloc[index, 0]: - # skip if already processed - print(f'Skipping already processed inspectionMarkLocation {inspectionMarkLocationsTable.iloc[index, 0]}') - continue - # Create Entity property dicts - inspectionMarkLocationValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create inspectionMarkLocations + for index, row in inspectionMarkLocationsTable.iterrows(): + # For every row in table... + if index < len(processedRows) and inspectionMarkLocationsTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed inspectionMarkLocation {inspectionMarkLocationsTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '###{' in str(value): - print('replaced curly braces') - value = str(value).replace('###{new_line', '') - value = str(value).replace('}###', '') - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - match key: - case 'id': + # Create Entity property dicts + inspectionMarkLocationValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value continue - case 'f__uuid': - inspectionMarkLocationValues['f65178b07306225efb0b556f6e4f54a5'] = value # UUID - case 'f__67b0_bz_dok_nr': - inspectionMarkLocationValues['f2d0b120ed40e17a5ad3f31d594d9b1c'] = value # Inspection Mark Identifier - case 'f__67b4_anbr_ort': - inspectionMarkLocationValues['f8a6343c2a8a5523eb2f0602f2baae04'] = value # Location + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + match key: + case 'id': + continue + case 'f__uuid': + inspectionMarkLocationValues['f65178b07306225efb0b556f6e4f54a5'] = value # UUID + case 'f__67b0_bz_dok_nr': + inspectionMarkLocationValues['f2d0b120ed40e17a5ad3f31d594d9b1c'] = value # Inspection Mark Identifier + case 'f__67b4_anbr_ort': + inspectionMarkLocationValues['f8a6343c2a8a5523eb2f0602f2baae04'] = value # Location - case _: - print(f'{key} is not a valid field, skipping.') + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - inspectionMarkLocation = Entity(api=api, fields=inspectionMarkLocationValues, bundle_id='b4158ec3a326d8ab504062296a82f13a') - api.save(inspectionMarkLocation) + # Create Material + inspectionMarkLocation = Entity(api=api, fields=inspectionMarkLocationValues, bundle_id=bundleId) + api.save(inspectionMarkLocation) - print(f'Created inspectionMarkLocation {index}: {inspectionMarkLocation.uri}') + print(f'Created inspectionMarkLocation {index}: {inspectionMarkLocation.uri}') - # Write log - processedRows = processedRows._append({'id': row['id'], 'uuid': inspectionMarkLocationValues['f65178b07306225efb0b556f6e4f54a5'][0], 'uri': inspectionMarkLocation.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processedInspectionMarkLocation.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': inspectionMarkLocationValues['f65178b07306225efb0b556f6e4f54a5'][0], 'uri': inspectionMarkLocation.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finished importing inspection mark locations') diff --git a/09_importInspectionMarkRelation.py b/09_importInspectionMarkRelation.py index e3e5bcf..b2c5f50 100644 --- a/09_importInspectionMarkRelation.py +++ b/09_importInspectionMarkRelation.py @@ -5,80 +5,66 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importInspectionMarkRelation(api, engine): + print('Importing inspection mark relations...') -# Load the environment variables -load_dotenv() + tableName = 'c__67b7_beziehung' + bundleId = 'bd9b0ff8dc3a6d9284e1798531389bf1' -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri']) -try: - processedRows = pd.read_csv(f'./logs/processedInspectionMarkRelation.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=[ 'uuid', 'uri']) + # Load sources table + inspectionMarkRelationsTable = pd.read_sql_table(tableName, con=engine) -# Load sources table -inspectionMarkRelationsTable = pd.read_sql_table('c__67b7_beziehung', con=engine) - -inspectionMarkRelationValues = {} - -# Create inspectionMarkRelations -for index, row in inspectionMarkRelationsTable.iterrows(): - # For every row in table... - if index < len(processedRows) and inspectionMarkRelationsTable.iloc[index, 0] == processedRows.iloc[index, 0]: - # skip if already processed - print(f'Skipping already processed inspectionMarkRelation {inspectionMarkRelationsTable.iloc[index, 0]}') - continue - # Create Entity property dicts - inspectionMarkRelationValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create inspectionMarkRelations + for index, row in inspectionMarkRelationsTable.iterrows(): + # For every row in table... + if index < len(processedRows) and inspectionMarkRelationsTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed inspectionMarkRelation {inspectionMarkRelationsTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '###{{new_line}}###' in str(value): - print('replaced curly braces') - value = str(value).replace('###{{new_line}}###', '') - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - match key: - case 'id': + # Create Entity property dicts + inspectionMarkRelationValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value continue - case 'f__uuid': - inspectionMarkRelationValues['ffd502413c286815811ae5546f73935b'] = value # UUID - case 'f__67b8_bez_bz_nr': - inspectionMarkRelationValues['ff3f6dd331ed27515f6721ac8312706c'] = value # Inspection Mark Identifier - case 'f__67b7_beziehung': - inspectionMarkRelationValues['f1cb8db7e1c26a5b5fe0c9d8fca60de2'] = value # Relation + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + match key: + case 'id': + continue + case 'f__uuid': + inspectionMarkRelationValues['ffd502413c286815811ae5546f73935b'] = value # UUID + case 'f__67b8_bez_bz_nr': + inspectionMarkRelationValues['ff3f6dd331ed27515f6721ac8312706c'] = value # Inspection Mark Identifier + case 'f__67b7_beziehung': + inspectionMarkRelationValues['f1cb8db7e1c26a5b5fe0c9d8fca60de2'] = value # Relation - case _: - print(f'{key} is not a valid field, skipping.') + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - inspectionMarkRelation = Entity(api=api, fields=inspectionMarkRelationValues, bundle_id='bd9b0ff8dc3a6d9284e1798531389bf1') - api.save(inspectionMarkRelation) + # Create Material + inspectionMarkRelation = Entity(api=api, fields=inspectionMarkRelationValues, bundle_id='bd9b0ff8dc3a6d9284e1798531389bf1') + api.save(inspectionMarkRelation) - print(f'Created inspectionMarkRelation {index}: {inspectionMarkRelation.uri}') + print(f'Created inspection mark relation {index}: {inspectionMarkRelation.uri}') - # Write log - processedRows = processedRows._append({'uuid': inspectionMarkRelationValues['ffd502413c286815811ae5546f73935b'][0], 'uri': inspectionMarkRelation.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processedInspectionMarkRelation.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': inspectionMarkRelationValues['ffd502413c286815811ae5546f73935b'][0], 'uri': inspectionMarkRelation.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finish') diff --git a/10_importMarkDatingInfo.py b/10_importMarkDatingInfo.py index 058bb90..d4de6a8 100644 --- a/10_importMarkDatingInfo.py +++ b/10_importMarkDatingInfo.py @@ -5,80 +5,65 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importMarkDatingInfo(api, engine): + print('Importing mark dating info...') + tableName = 'c__68dm_datierung_marke' + bundleId = 'b9cfb95e627e1710cf8d736d4ca5db64' -# Load the environment variables -load_dotenv() + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri']) -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + # Load sources table + datingInfosTable = pd.read_sql_table(tableName, con=engine) -try: - processedRows = pd.read_csv(f'./logs/processedDatingInfo.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=[ 'uuid', 'uri']) - -# Load sources table -datingInfosTable = pd.read_sql_table('c__68dm_datierung_marke', con=engine) - -datingInfoValues = {} - -# Create datingInfos -for index, row in datingInfosTable.iterrows(): - # For every row in table... - if index < len(processedRows) and datingInfosTable.iloc[index, 0] == processedRows.iloc[index, 0]: - # skip if already processed - print(f'Skipping already processed datingInfo {datingInfosTable.iloc[index, 0]}') - continue - # Create Entity property dicts - datingInfoValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create datingInfos + for index, row in datingInfosTable.iterrows(): + # For every row in table... + if index < len(processedRows) and datingInfosTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed datingInfo {datingInfosTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '###{{new_line}}###' in str(value): - print('replaced curly braces') - value = str(value).replace('###{{new_line}}###', '') - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - match key: - case 'id': + # Create Entity property dicts + datingInfoValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value continue - case 'f__uuid': - datingInfoValues['f74baaf58e49393cc89d6616ee197901'] = value # UUID - case 'f__68dm_datierung_marke': - datingInfoValues['f0da3b36d16e16602bb550aff7d36297'] = value # Date - case 'f__68bm_bem_dat_marke': - datingInfoValues['fe7870b5a86040d81140bccb01697765'] = value # Note + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + match key: + case 'id': + continue + case 'f__uuid': + datingInfoValues['f74baaf58e49393cc89d6616ee197901'] = value # UUID + case 'f__68dm_datierung_marke': + datingInfoValues['f0da3b36d16e16602bb550aff7d36297'] = value # Date + case 'f__68bm_bem_dat_marke': + datingInfoValues['fe7870b5a86040d81140bccb01697765'] = value # Note - case _: - print(f'{key} is not a valid field, skipping.') + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - datingInfo = Entity(api=api, fields=datingInfoValues, bundle_id='b9cfb95e627e1710cf8d736d4ca5db64') #Dating Information Assignment - api.save(datingInfo) + # Create Material + datingInfo = Entity(api=api, fields=datingInfoValues, bundle_id='b9cfb95e627e1710cf8d736d4ca5db64') #Dating Information Assignment + api.save(datingInfo) - print(f'Created datingInfo {index}: {datingInfo.uri} of {len(datingInfosTable)}') + print(f'Created mark dating info {index}: {datingInfo.uri} of {len(datingInfosTable)}') - # Write log - processedRows = processedRows._append({'uuid': datingInfoValues['f74baaf58e49393cc89d6616ee197901'][0], 'uri': datingInfo.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processedDatingInfo.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': datingInfoValues['f74baaf58e49393cc89d6616ee197901'][0], 'uri': datingInfo.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finished importing mark dating info') diff --git a/11_importMarkInformation.py b/11_importMarkInformation.py deleted file mode 100644 index d6d50fb..0000000 --- a/11_importMarkInformation.py +++ /dev/null @@ -1,97 +0,0 @@ -import uuid # For UUID creation -from initDb import initDb # For database initialization -from wisski.api import Api, Pathbuilder, Entity # For WissKI API -import os # For environment variable loading -from dotenv import load_dotenv # For environment variable loading -import pandas as pd # For dataframe handling - -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() - -# Load the environment variables -load_dotenv() - -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') - - -tableName = "c__6760_markenart" -bundleId = 'bc7ce6906f78e760f22ff13226b1332d' # Mark information assignment - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['id', 'docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.iloc[index, 0] == processedRows.iloc[index, 0]: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.iloc[index, 0]}') - continue - # Create Entity property dicts - entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value - continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - continue - case 'f__uuid': - entityValues['f3b8aaf7e79229b4da8214d491e375ec'] = value # UUID - fUuid = value[0] - case 'f__5064_num__dat_': - entityValues['fe6921098808e68cae68f0858411826c'] = value # Artist Assignment - case 'f__6894_anbr_ort': - entityValues['f694ed57271ab7be57249e0ee5c41ba4'] = value # Location - case 'f__6700_mar_dok_nr_': - entityValues['fdd3380d4a11654f32687429796cabc3'] = value # Mark Document Number - case 'f__6760_markenart': - entityValues['fd381aa9c3ebdf417e6cbccd60ede279'] = value # Mark Type - case 'f__684c_bedeutung_bz': - entityValues['f4947de52885f517baef0cdf3cb53b61'] = value # Meaning Inspection Mark - case 'f__684a_bedeutung_mz': - entityValues['f542c4c945725c6fdc5ab6409a877f02'] = value # Meaning Master Mark - case 'f__6770_rosenb_nr_': - entityValues['f0ff7020a9c25ea2706875837fe61b04'] = value # Rosenberg Number - - case _: - print(f'{key} is not a valid field, skipping.') - - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) - - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') - - # Write log - processedRows = processedRows._append({'id': row['id'], 'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - -print('finish') diff --git a/12_importBirth.py b/12_importBirth.py index aa2bde1..ad51c87 100644 --- a/12_importBirth.py +++ b/12_importBirth.py @@ -5,94 +5,80 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importBirth(api, engine): + print('Importing birth...') -# Load the environment variables -load_dotenv() + test = False + tableName = "c__3270_geb_datum" + bundleId = 'b54049ec931bffb62359b4bdb11435fc' -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -test = False -tableName = "c__3270_geb_datum" -bundleId = 'b54049ec931bffb62359b4bdb11435fc' + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts - entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__uuid': - entityValues['ff2a4da76944f5aba7d625c169d9ff66'] = value # UUID - fUuid = value[0] - case 'f__3290_geb_ort': - entityValues['fe71d86a78289c0b54242f5a3b67f81f'] = value # Birth place - case 'f__3270_geb_datum': - entityValues['ff3a9f042976963ac356db02d764b002'] = value # Date - case 'f__32ls_lit__stelle': - entityValues['fa03638df8a53e9aae38471fe10f409a'] = value # Literature Reference - case 'f__32lt_lit__kurztitel': - entityValues['f1af25f1770bd0db1982780697600cf4'] = value # Literature short title - case 'f__32bm_bem_geburt': - entityValues['f572f5e0f02f1c9b7c3ece5ffcf86c43'] = value # Note - case 'f__32qs_quelle_stelle': - entityValues['f1ebceaa76bac9ebf266733f64caa37c'] = value # Source reference - case 'f__32qt_quelle_kurztitel': - entityValues['f1a3597a874b3df9c1d87c5a32b487b0'] = value # Source short title - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__uuid': + entityValues['ff2a4da76944f5aba7d625c169d9ff66'] = value # UUID + fUuid = value[0] + case 'f__3290_geb_ort': + entityValues['fe71d86a78289c0b54242f5a3b67f81f'] = value # Birth place + case 'f__3270_geb_datum': + entityValues['ff3a9f042976963ac356db02d764b002'] = value # Date + case 'f__32ls_lit__stelle': + entityValues['fa03638df8a53e9aae38471fe10f409a'] = value # Literature Reference + case 'f__32lt_lit__kurztitel': + entityValues['f1af25f1770bd0db1982780697600cf4'] = value # Literature short title + case 'f__32bm_bem_geburt': + entityValues['f572f5e0f02f1c9b7c3ece5ffcf86c43'] = value # Note + case 'f__32qs_quelle_stelle': + entityValues['f1ebceaa76bac9ebf266733f64caa37c'] = value # Source reference + case 'f__32qt_quelle_kurztitel': + entityValues['f1a3597a874b3df9c1d87c5a32b487b0'] = value # Source short title + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created birth {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() -print('finish') + print('finished importing birth') diff --git a/13_importDeath.py b/13_importDeath.py index b04bc35..4684fee 100644 --- a/13_importDeath.py +++ b/13_importDeath.py @@ -5,98 +5,84 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importDeath(api, engine): + print('Importing death...') -# Load the environment variables -load_dotenv() + test = False -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + tableName = "c__3330_todes_dat_" + bundleId = 'b487c08016f572b9ecf3f9173339fec3' -test = True + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -tableName = "c__3330_todes_dat_" -bundleId = 'b487c08016f572b9ecf3f9173339fec3' + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts - entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__uuid': - entityValues['f8beb0d372a5cf6f1668c47acf7e53cd'] = value # UUID - fUuid = value[0] - case 'f__3330_todes_dat_': - entityValues['f385a8c323f0a2f49d8eb175e1535b1b'] = value # Death date - case 'f__33ls_lit__stelle': - entityValues['fb4f168aa6a73169ef0350408a6260cc'] = value # Literature Reference - case 'f__33lt_lit__kurztitel': - entityValues['fd4ed8828d72a575f8609ba2c442b4b2'] = value # Literature short title - case 'f__33bm_bem_tod': - entityValues['f3028661430081ae44aa950abe0afbac'] = value # Note - case 'f__3350_tod_ort': - entityValues['fd80c2c8ba4c64c01e9c46ac7ae00d93'] = value # Place - case 'f__33qs_quelle_stelle': - entityValues['fd98cf7fbc0de4529e2a2d5e0b0c28bf'] = value # Source reference - case 'f__33qt_quelle_kurztitel': - entityValues['f973818e6c3d36ddd44ba3a713e308e6'] = value # Source short title - case 'f__710t_art_ereignis': - entityValues['fc039c43502b3525a92a8330d91f7944'] = value # Event type - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__uuid': + entityValues['f8beb0d372a5cf6f1668c47acf7e53cd'] = value # UUID + fUuid = value[0] + case 'f__3330_todes_dat_': + entityValues['f385a8c323f0a2f49d8eb175e1535b1b'] = value # Death date + case 'f__33ls_lit__stelle': + entityValues['fb4f168aa6a73169ef0350408a6260cc'] = value # Literature Reference + case 'f__33lt_lit__kurztitel': + entityValues['fd4ed8828d72a575f8609ba2c442b4b2'] = value # Literature short title + case 'f__33bm_bem_tod': + entityValues['f3028661430081ae44aa950abe0afbac'] = value # Note + case 'f__3350_tod_ort': + entityValues['fd80c2c8ba4c64c01e9c46ac7ae00d93'] = value # Place + case 'f__33qs_quelle_stelle': + entityValues['fd98cf7fbc0de4529e2a2d5e0b0c28bf'] = value # Source reference + case 'f__33qt_quelle_kurztitel': + entityValues['f973818e6c3d36ddd44ba3a713e308e6'] = value # Source short title + case 'f__710t_art_ereignis': + entityValues['fc039c43502b3525a92a8330d91f7944'] = value # Event type + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created death {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) - if test: - break + if test: + break -print('finish') + print('finish') diff --git a/14_importDating.py b/14_importDating.py index 36178e1..d20b741 100644 --- a/14_importDating.py +++ b/14_importDating.py @@ -5,82 +5,67 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importDating(api, engine): + print('Importing dating...') -# Load the environment variables -load_dotenv() + tableName = "c__8100_datum" + bundleId = 'b9cfb95e627e1710cf8d736d4ca5db64' -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "c__8100_datum" -bundleId = 'b9cfb95e627e1710cf8d736d4ca5db64' - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=[ 'docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.iloc[index, 0] == processedRows.iloc[index, 0]: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.iloc[index, 0]}') - continue - # Create Entity property dicts - entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__uuid': - entityValues['f74baaf58e49393cc89d6616ee197901'] = value # UUID - uuid = value[0] - case 'f__8100_datum': - entityValues['f0da3b36d16e16602bb550aff7d36297'] = value # Date - case 'f__81bm_bem__datierung': - entityValues['fe7870b5a86040d81140bccb01697765'] = value # Note - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__uuid': + entityValues['f74baaf58e49393cc89d6616ee197901'] = value # UUID + fUuid = value[0] + case 'f__8100_datum': + entityValues['f0da3b36d16e16602bb550aff7d36297'] = value # Date + case 'f__81bm_bem__datierung': + entityValues['fe7870b5a86040d81140bccb01697765'] = value # Note + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created dating {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finish') diff --git a/15_importGoldsmithRelation.py b/15_importGoldsmithRelation.py index 2c1ec28..cbae38a 100644 --- a/15_importGoldsmithRelation.py +++ b/15_importGoldsmithRelation.py @@ -5,89 +5,75 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importGoldsmithRelation(api, engine): + print('Importing goldsmith relation...') -# Load the environment variables -load_dotenv() + test = False -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + tableName = "c__3007_bezieh__zu_gs" + bundleId = 'bef43e8a958e6a9bee04534b3841f6a0' -test = False + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -tableName = "c__3007_bezieh__zu_gs" -bundleId = 'bef43e8a958e6a9bee04534b3841f6a0' + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=[ 'docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.iloc[index, 0] == processedRows.iloc[index, 0]: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.iloc[index, 0]}') - continue - # Create Entity property dicts - entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__uuid': - entityValues['f588ff2629e3758ae18ec28c02270d27'] = value # UUID - fUuid = value[0] - case 'f__3011_verw__art': - entityValues['f2de276528d6b020306b8c7784008e5c'] = value # Actor relation type - case 'f__3010_name_gs': - entityValues['fc16719402aff4a1afec3387bf2bbc34'] = value # Goldsmith - case 'f__30bm_bem_beziehung': - entityValues['f7de6b267146070fa38ea5dc45150fa4'] = value # Note - case 'f__3007_bezieh__zu_gs': - entityValues['f8a46491ebad0ba670384a049402d697'] = value # Relation - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__uuid': + entityValues['f588ff2629e3758ae18ec28c02270d27'] = value # UUID + fUuid = value[0] + case 'f__3011_verw__art': + entityValues['f2de276528d6b020306b8c7784008e5c'] = value # Actor relation type + case 'f__3010_name_gs': + entityValues['fc16719402aff4a1afec3387bf2bbc34'] = value # Goldsmith + case 'f__30bm_bem_beziehung': + entityValues['f7de6b267146070fa38ea5dc45150fa4'] = value # Note + case 'f__3007_bezieh__zu_gs': + entityValues['f8a46491ebad0ba670384a049402d697'] = value # Relation + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created goldsmith relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() -print('finish') + print('finished importing goldsmith relation') diff --git a/16_importClient.py b/16_importClient.py index 6ddcef9..359e2cb 100644 --- a/16_importClient.py +++ b/16_importClient.py @@ -5,86 +5,72 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() - -# Load the environment variables -load_dotenv() - -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') +def importClient(api, engine): + print('Importing client...') -tableName = "c__410a_auftraggeber" -bundleId = 'b85d9987d762fb4e8ce89a69b0b8de31' + tableName = "c__410a_auftraggeber" + bundleId = 'b85d9987d762fb4e8ce89a69b0b8de31' -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=[ 'docId', 'uuid', 'uri']) + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.iloc[index, 0] == processedRows.iloc[index, 0]: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.iloc[index, 0]}') - continue - # Create Entity property dicts - entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__uuid': - entityValues['fe0c458dfe9c0657fd02f312c2154d62'] = value # UUID - fUuid = value[0] - case 'f__410a_auftraggeber': - entityValues['f5ab8fb89d793bd5d27740c2b26bf672'] = value # Client - case 'f__41bm_bem__auftragg_': - entityValues['f0f33e0d5b40933d83260da3876a6cd3'] = value # Note - case 'f__41aa_anlass_auftrag': - entityValues['f88f0dbbcaff35acc80f1e6be571bd9e'] = value # Reason - case 'f__41as_stand_auftragg_': - entityValues['f9d4601e72d705c12fd7f09560e90d37'] = value # Status - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__uuid': + entityValues['fe0c458dfe9c0657fd02f312c2154d62'] = value # UUID + fUuid = value[0] + case 'f__410a_auftraggeber': + entityValues['f5ab8fb89d793bd5d27740c2b26bf672'] = value # Client + case 'f__41bm_bem__auftragg_': + entityValues['f0f33e0d5b40933d83260da3876a6cd3'] = value # Note + case 'f__41aa_anlass_auftrag': + entityValues['f88f0dbbcaff35acc80f1e6be571bd9e'] = value # Reason + case 'f__41as_stand_auftragg_': + entityValues['f9d4601e72d705c12fd7f09560e90d37'] = value # Status + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(tableName)}') + print(f'Created client {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finish') diff --git a/17_importMentioned.py b/17_importMentioned.py index a58cdf5..d69a83e 100644 --- a/17_importMentioned.py +++ b/17_importMentioned.py @@ -5,100 +5,86 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importMentioned(api, engine): + print('Importing mentioned...') -# Load the environment variables -load_dotenv() + test = False + tableName = "c__7060_erwaehnt__datum_" + bundleId = 'b04b1756b09ba3260de278824332ad6c' -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -test = False -tableName = "c__7060_erwaehnt__datum_" -bundleId = 'b04b1756b09ba3260de278824332ad6c' + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts - entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__uuid': - entityValues['fac8bbc9701f5da711a6a49beca1b3e4'] = value # UUID - fUuid = value[0] - case 'f__410a_auftraggeber': - entityValues['f6b456466f45f72952a953bf169a47cc'] = value # Client - case 'f__7060_erwaehnt__datum_': - entityValues['ffdae7d7aeb84467faebf5468fb8b94f'] = value # Date - case 'f__7100_art_ereignis': - entityValues['fb462fbc544045fc244da8d490ed1cfc'] = value # Event type - case 'f__70ls_lit__stelle': - entityValues['f11f8bc3fdbedc686430ef57edfcf620'] = value # Literature Reference - case 'f__70lt_lit__kurztitel': - entityValues['f4ed2a340720f643bcc49ac9581b1181'] = value # Literature short title - case 'f__34ms_bei_meister_': - entityValues['f9d8ac79df3eb667db8fb8b23e52a816'] = value # Master - case 'f__70bm_bem_ereignis': - entityValues['f37dbed94d03576c91fff9c3c9026da5'] = value # Note - case 'f__70qs_quelle_stelle': - entityValues['ffc72e8058fd9efd4bb92270520942bd'] = value # Source reference - case 'f__70qt_quelle_kurztitel': - entityValues['f433afdf58621b6962dea8821cf21bb9'] = value # Source short title - case 'f__3420_taet_ort': - entityValues['f53e436b293c82f07fb17dd40c01f868'] = value # Workplace - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__uuid': + entityValues['fac8bbc9701f5da711a6a49beca1b3e4'] = value # UUID + fUuid = value[0] + case 'f__410a_auftraggeber': + entityValues['f6b456466f45f72952a953bf169a47cc'] = value # Client + case 'f__7060_erwaehnt__datum_': + entityValues['ffdae7d7aeb84467faebf5468fb8b94f'] = value # Date + case 'f__7100_art_ereignis': + entityValues['fb462fbc544045fc244da8d490ed1cfc'] = value # Event type + case 'f__70ls_lit__stelle': + entityValues['f11f8bc3fdbedc686430ef57edfcf620'] = value # Literature Reference + case 'f__70lt_lit__kurztitel': + entityValues['f4ed2a340720f643bcc49ac9581b1181'] = value # Literature short title + case 'f__34ms_bei_meister_': + entityValues['f9d8ac79df3eb667db8fb8b23e52a816'] = value # Master + case 'f__70bm_bem_ereignis': + entityValues['f37dbed94d03576c91fff9c3c9026da5'] = value # Note + case 'f__70qs_quelle_stelle': + entityValues['ffc72e8058fd9efd4bb92270520942bd'] = value # Source reference + case 'f__70qt_quelle_kurztitel': + entityValues['f433afdf58621b6962dea8821cf21bb9'] = value # Source short title + case 'f__3420_taet_ort': + entityValues['f53e436b293c82f07fb17dd40c01f868'] = value # Workplace + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(tableName)}') + print(f'Created mentioned {index}: {entity.uri} of {len(tableName)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() -print('finish') + print('finish') diff --git a/18_importNumDating.py b/18_importNumDating.py index 6284f1b..4c77e30 100644 --- a/18_importNumDating.py +++ b/18_importNumDating.py @@ -5,82 +5,67 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importNumDating(api, engine): + print('Importing num dating...') -# Load the environment variables -load_dotenv() + tableName = "c__5064_num__dat_" + bundleId = 'b9cfb95e627e1710cf8d736d4ca5db64' # Dating Information Assignment -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "c__5064_num__dat_" -bundleId = 'b9cfb95e627e1710cf8d736d4ca5db64' # Dating Information Assignment - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=[ 'docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.iloc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts - entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__uuid': - entityValues['f74baaf58e49393cc89d6616ee197901'] = value # UUID - uuid = value[0] - case 'f__5064_num__dat_': - entityValues['f0da3b36d16e16602bb550aff7d36297'] = value # Date - case 'f__50bm_bem__datierung': - entityValues['fe7870b5a86040d81140bccb01697765'] = value # Note - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__uuid': + entityValues['f74baaf58e49393cc89d6616ee197901'] = value # UUID + fUuid = value[0] + case 'f__5064_num__dat_': + entityValues['f0da3b36d16e16602bb550aff7d36297'] = value # Date + case 'f__50bm_bem__datierung': + entityValues['fe7870b5a86040d81140bccb01697765'] = value # Note + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(tableName)}') + print(f'Created num dating {index}: {entity.uri} of {len(tableName)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finish') diff --git a/19_importOriginAssignment.py b/19_importOriginAssignment.py index 0662aea..86eacf4 100644 --- a/19_importOriginAssignment.py +++ b/19_importOriginAssignment.py @@ -5,85 +5,70 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importOriginAssignment(api, engine): + print('Importing origin assignment...') + test = False + tableName = "c__3204_herkunft" + bundleId = 'b1d5be81f8b3dfbf9d6d90379cc0a14f' -# Load the environment variables -load_dotenv() + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -test = False -tableName = "c__3204_herkunft" -bundleId = 'b1d5be81f8b3dfbf9d6d90379cc0a14f' - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=[ 'docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.iloc[index, 0] == processedRows.iloc[index, 0]: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.iloc[index, 0]}') - continue - # Create Entity property dicts - entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__uuid': - entityValues['f0d656adf9a5a9501e2f837af2e71dd6'] = value # UUID - fUuid = value[0] - case 'f__3hbm_bem_herkunft': - entityValues['f3755949b812523c5d2005ea831c122f'] = value # Note - case 'f__3204_herkunft': - entityValues['fecbc849373f6a48c23be62619da3b09'] = value # Place - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__uuid': + entityValues['f0d656adf9a5a9501e2f837af2e71dd6'] = value # UUID + fUuid = value[0] + case 'f__3hbm_bem_herkunft': + entityValues['f3755949b812523c5d2005ea831c122f'] = value # Note + case 'f__3204_herkunft': + entityValues['fecbc849373f6a48c23be62619da3b09'] = value # Place + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created origin assignment {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) - if test: - exit() + if test: + exit() -print('finish') + print('finished importing origin assignments') diff --git a/20_importWorkshops.py b/20_importWorkshops.py index 152f4cf..ee39652 100644 --- a/20_importWorkshops.py +++ b/20_importWorkshops.py @@ -5,82 +5,71 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importWorkshops(api, engine): + print('Importing workshops...') -# Load the environment variables -load_dotenv() + tableName = "c__nfws_forts_werkst_" + bundleId = 'beb03bccbdffdd31567df370303c1e2d' -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri']) -try: - processedRows = pd.read_csv(f'./logs/processedWorkshops.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=[ 'uuid', 'uri']) + test = False + # Load sources table + workshopsTable = pd.read_sql_table(tableName, con=engine) -test = False -# Load sources table -workshopsTable = pd.read_sql_table('c__nfws_forts_werkst_', con=engine) - -workshopValues = {} - -# Create workshops -for index, row in workshopsTable.iterrows(): - # For every row in table... - if index < len(processedRows) and workshopsTable.iloc[index, 0] == processedRows.iloc[index, 0]: - # skip if already processed - print(f'Skipping already processed workshop {workshopsTable.iloc[index, 0]}') - continue - # Create Entity property dicts - workshopValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create workshops + for index, row in workshopsTable.iterrows(): + # For every row in table... + if index < len(processedRows) and workshopsTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {workshopsTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - match key: - case 'id': - docId = value[0] - case 'f__uuid': - workshopValues['fa7c19f4d03d7d15acf588460654bbf2'] = value # UUID - case 'f__nfws_forts_werkst_': - workshopValues['ff1aaeb118005d8506af6f56f7e424a4'] = value # Continued by - case 'f__nfbm_bem_forts_': - workshopValues['f71d24e2922d3151603ce144c0972f40'] = value # Note - case 'f__nfzr_zeitraumforts_': - workshopValues['f865ade60ba332a0a3ab4b77c39af7f4'] = value # Time-Span - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + workshopValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + match key: + case 'id': + docId = value[0] + case 'f__uuid': + workshopValues['fa7c19f4d03d7d15acf588460654bbf2'] = value # UUID + case 'f__nfws_forts_werkst_': + workshopValues['ff1aaeb118005d8506af6f56f7e424a4'] = value # Continued by + case 'f__nfbm_bem_forts_': + workshopValues['f71d24e2922d3151603ce144c0972f40'] = value # Note + case 'f__nfzr_zeitraumforts_': + workshopValues['f865ade60ba332a0a3ab4b77c39af7f4'] = value # Time-Span + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - workshop = Entity(api=api, fields=workshopValues, bundle_id='beb03bccbdffdd31567df370303c1e2d') - api.save(workshop) + # Create Material + workshop = Entity(api=api, fields=workshopValues, bundle_id=bundleId) + api.save(workshop) - print(f'Created workshop {index}: {workshop.uri} of {len(workshopsTable)}') + print(f'Created workshop {index}: {workshop.uri} of {len(workshopsTable)}') - # Write log - processedRows = processedRows._append({'uuid': workshopValues['fa7c19f4d03d7d15acf588460654bbf2'][0], 'uri': workshop.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processedWorkshops.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': workshopValues['fa7c19f4d03d7d15acf588460654bbf2'][0], 'uri': workshop.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) - if test: - exit() + if test: + exit() -print('finish') + print('finished importing workshops') diff --git a/21_importArtifacts.py b/21_importArtifacts.py new file mode 100644 index 0000000..7cdab40 --- /dev/null +++ b/21_importArtifacts.py @@ -0,0 +1,207 @@ +import uuid # For UUID creation +from initDb import initDb # For database initialization +from wisski.api import Api, Pathbuilder, Entity # For WissKI API +import os # For environment variable loading +from dotenv import load_dotenv # For environment variable loading +import pandas as pd # For dataframe handling + +def importArtifacts(api, engine): + print('Importing artifacts...') + + tableName = "c__obj" + bundleId = 'bd30c2c64a3caa8bb1628c780c3f24bb' + + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri']) + + # Load artifacts table + artifactsTable = pd.read_sql_table(tableName, con=engine) + + # Create artifacts + for index, row in artifactsTable.iterrows(): + # For every row in table... + if index < len(processedRows) and artifactsTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed artifact {artifactsTable.loc[index, "id"]}') + continue + # Create Entity property dicts + artifactValues = {} + creationValues = {} + digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]} + imageValues = {} + imageAssignmentValues = {'f067784f5b1ff850672124a2b05360de': [str(uuid.uuid4())]} + productionPlaceAssignmentValues = {'f40cc95db3ccaa1dbbf27294338d9f07': [str(uuid.uuid4())]} + dimensionValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + value = str(value).replace(' & ', '&') + + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + match key: + case 'id': + continue + case 'f__uuid': + artifactValues['feb48c9a7efc444449b4b8defcd6d8bd'] = value # UUID + case 'f__5000_obj_dok_nr_': + artifactValues['f7e2a8a273ab3d577bf5854902550c09'] = value # Document Identifier + docId = value[0] + case 'f__500n_ngk_nr_': + artifactValues['f6e041bd0b16b21596849732c01cb168'] = value # NGK Number + case 'f__5130_entst_ort': + # We map productions place to Production Place Assignment entity. + productionPlaceAssignmentValues['f43f9589eef324fb12c26226dfe94246'] = value # Production Place + case 'f__5200_obj_titel': + artifactValues['fd06dcc49a29b1a63fa4a789ec17e5c6'] = value # Title + case 'f__5210_status': + artifactValues['f35c9c9b0991729c36acb41645fe81d1'] = value # Status + case 'f__5220_gattung': + artifactValues['f2fd7f8a81d5eb1a20371b9acfd1ab59'] = value # Genre + case 'f__5223_form__attribut': + artifactValues['f05bbd6e29a7d303e4370b04c12b3f75'] = value # Formattribute + case 'f__5226_art': + artifactValues['f593fa773a6ea458101ba2325a18abbe'] = value # artifact type + case 'f__523f_funktion': + artifactValues['f476ba24127d4dff1018acebf45a05f6'] = value # Function + case 'f__5240_formtyp': + artifactValues['fa7cfd9dbb3d2517c1898b3051d8dbed'] = value # Shape + case 'f__524g_gestalt': + artifactValues['f8309a21fa79bc6bd2506060b419d2df'] = value # Figure + case 'f__5362_hoehe': + # We map dimensions to Dimension entity. + dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['height'] # Type + dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension + dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID + + case 'f__5364_breite': + dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['width'] # Type + dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension + dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID + + case 'f__5366_tiefe': + dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['depth'] # Type + dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension + dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID + + case 'f__5368_laenge': + dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['length'] # Type + dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension + dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID + + case 'f__5370_durchmesser': + dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['diameter'] # Type + dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension + dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID + + case 'f__5380_gewicht': + dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['weight'] # Type + dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension + dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID + case 'f__538h_hist__gewicht': + dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['historical_weight'] # Type + dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension + dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID + case 'f__55ng_darst__schlagw_': + artifactValues['f6abbd4f39a6f79de5de2b14b98e51ff'] = value # Keywords + case 'f__5bes_beschreibung': + artifactValues['f26ad2bc1f084478cd7011f7b8451526'] = value # Description + case 'f__5ges_geschichte': + artifactValues['f40120d7c13ef02b486c69245f6c2306'] = value # History + case 'f__68an_abdruck_nr_': + artifactValues['fd3740649cc06f45677eb0546908cdac'] = value # Print Number + case 'f__8540_repro_nr_': + # We map images to Image entity + for item in value: + if item is not None: + # Replace dir paths in name + item = item.replace('Objekte/', 'objects/') + item = item.replace('Objekte\\', 'objects/') + item = item.replace('Objekte3\\', 'objects/') + item = item.replace('Objekte4\\', 'objects/') + item = item.replace('objekte4\\', 'objects/') + item = item.replace('Objekte5\\', 'objects/') + item = item.replace('objekte5\\', 'objects/') + item = item.replace('Marken\\', 'marks/') + item = item.replace('Marken/', 'marks/') + imageValues.setdefault(item, {})['feb10344eaa7a5f414d1e8392853eba9'] = [item] # Reproduction Number (Image) + imageValues[item]['fc8d57e55f203c75c2f8a1ae79378ac7'] = ['public://artifact_images/' + item + '.jpg'] # File + imageValues[item]['f11beac4b638016479e6f3fbc7e55d1a'] = [str(uuid.uuid4())] # UUID + case 'f__stwv_statwerkverz': + artifactValues['fee0db94d62fae6370a89ff4757ff539'] = value # Catalogue_of_Works + case 'f__9990_kommentar': + artifactValues['fefe289aa0c9563a153be6da7d37e3ff'] = value # Comment + case 'f__9900_datum_erfassung': + digitisationProcessValues['f1f5dd22371e5c1de41e0fb099e0e862'] = value # Recording date + case 'f__99ae_datum_aenderung': + digitisationProcessValues['f8976c6a9e5d91fe9caba8a57c27f204'] = value # Change date + case 'f__efbm_bem_erfassung': + digitisationProcessValues['f78a6310d13c717b82ddba814ac59024'] = value # Recording note + case 'f__ptxt_plug_in_text': + artifactValues['ffb8b04e8d57929a596fc32d6a84d07d'] = value # Plugin text + case _: + print(f'{key} is not a valid field, skipping.') + + # Create Production Place Assignment + productionPlaceAssignment = Entity(api=api, fields=productionPlaceAssignmentValues, bundle_id='b13bc6dc04d4bbdafb9536987eb43244') + api.save(productionPlaceAssignment) # Kai says, we can save all entities at once, but I save it instantly + + + # Create Dimension entities and add their UUIDs to a list + # because we link Artifact and Dimension over the UUID + dimension = [] + for key, value in dimensionValues.items(): + dimensionItem = Entity(api=api, fields=value, bundle_id='b73258adf62f35bd1be3fa2863fab558') + api.save(dimensionItem) + dimension.append(value['f802fd7bf45be523a9b188411a591420'][0]) + + # Create Image entities and add their UUIDs to a list + # because we link Image Assignment and Image over the UUID + imageList = [] + for key, value in imageValues.items(): + imageItem = Entity(api=api, fields=value, bundle_id='b8c6c4b478ead1c80e175ad0f98dafe3') + api.save(imageItem) + imageList.append(value['f11beac4b638016479e6f3fbc7e55d1a'][0]) + + # Create Image Assignment entities and add their UUIDs to a list + # because we link Artifact and Image Assignment over the UUID + if imageList: + imageAssignmentValues['f70afb79b45472fee3d02f011caa4b36'] = imageList # List of Image UUIDs + imageAssignment = Entity(api=api, fields=imageAssignmentValues, bundle_id='b88e5d94fb2a83d62df99cf64d6c010c') + api.save(imageAssignment) + + # Create Digitisation Process + digitisationProcess = Entity(api=api, fields=digitisationProcessValues, bundle_id='b22e6c47ccb3ab8a974b37279e1bc33b') + api.save(digitisationProcess) + + # Add the field values for reference + # UWAGA! Is the Value Production Place Assignment Correct? UWAGA! + artifactValues['f2676a0fb8db6ab62235328ae7c7a4b3'] = [productionPlaceAssignmentValues['f40cc95db3ccaa1dbbf27294338d9f07'][0]] # Production Place Assignment + artifactValues['fc700eb3f24f4f2a6c165128aa7117f1'] = dimension # Dimension + artifactValues['f7af1cd9c77448281dd7ecf29ba57e3e'] = [imageAssignmentValues['f067784f5b1ff850672124a2b05360de'][0]] # Image Assignment + artifactValues['f5a3f90d920da3db4cfdbaa6264b0e89'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process + + # Create Artifact + artifact = Entity(api=api, fields=artifactValues, bundle_id=bundleId) + api.save(artifact) + + print(f'Created artifact {index}: {artifact.uri} of {len(artifactsTable)}') + + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': artifactValues['feb48c9a7efc444449b4b8defcd6d8bd'][0], 'uri': artifact.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + + print('finished importing artifacts') diff --git a/21_importArtifcats.py b/21_importArtifcats.py deleted file mode 100644 index 0e23833..0000000 --- a/21_importArtifcats.py +++ /dev/null @@ -1,213 +0,0 @@ -import uuid # For UUID creation -from initDb import initDb # For database initialization -from wisski.api import Api, Pathbuilder, Entity # For WissKI API -import os # For environment variable loading -from dotenv import load_dotenv # For environment variable loading -import pandas as pd # For dataframe handling - -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() - -# Load the environment variables -load_dotenv() - -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') - -try: - processedRows = pd.read_csv(f'./logs/processedArtifacts.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['artifactId', 'uuid', 'uri']) - -# Load artifacts table -artifactsTable = pd.read_sql_table('c__obj', con=engine) - -# Create artifacts -for index, row in artifactsTable.iterrows(): - # For every row in table... - if index < len(processedRows) and artifactsTable.iloc[index, 0] == processedRows.iloc[index, 0]: - # skip if already processed - print(f'Skipping already processed artifact {artifactsTable.iloc[index, 0]}') - continue - # Create Entity property dicts - artifactValues = {} - creationValues = {} - digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]} - imageValues = {} - imageAssignmentValues = {'f067784f5b1ff850672124a2b05360de': [str(uuid.uuid4())]} - productionPlaceAssignmentValues = {'f40cc95db3ccaa1dbbf27294338d9f07': [str(uuid.uuid4())]} - dimensionValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value - continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - match key: - case 'id': - continue - case 'f__uuid': - artifactValues['feb48c9a7efc444449b4b8defcd6d8bd'] = value # UUID - case 'f__5000_obj_dok_nr_': - artifactValues['f7e2a8a273ab3d577bf5854902550c09'] = value # Document Identifier - docId = value[0] - case 'f__500n_ngk_nr_': - artifactValues['f6e041bd0b16b21596849732c01cb168'] = value # NGK Number - case 'f__5130_entst_ort': - # We map productions place to Production Place Assignment entity. - productionPlaceAssignmentValues['f43f9589eef324fb12c26226dfe94246'] = value # Production Place - case 'f__5200_obj_titel': - artifactValues['fd06dcc49a29b1a63fa4a789ec17e5c6'] = value # Title - case 'f__5210_status': - artifactValues['f35c9c9b0991729c36acb41645fe81d1'] = value # Status - case 'f__5220_gattung': - artifactValues['f2fd7f8a81d5eb1a20371b9acfd1ab59'] = value # Genre - case 'f__5223_form__attribut': - artifactValues['f05bbd6e29a7d303e4370b04c12b3f75'] = value # Formattribute - case 'f__5226_art': - artifactValues['f593fa773a6ea458101ba2325a18abbe'] = value # artifact type - case 'f__523f_funktion': - artifactValues['f476ba24127d4dff1018acebf45a05f6'] = value # Function - case 'f__5240_formtyp': - artifactValues['fa7cfd9dbb3d2517c1898b3051d8dbed'] = value # Shape - case 'f__524g_gestalt': - artifactValues['f8309a21fa79bc6bd2506060b419d2df'] = value # Figure - case 'f__5362_hoehe': - # We map dimensions to Dimension entity. - dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['height'] # Type - dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension - dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID - - case 'f__5364_breite': - dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['width'] # Type - dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension - dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID - - case 'f__5366_tiefe': - dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['depth'] # Type - dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension - dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID - - case 'f__5368_laenge': - dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['length'] # Type - dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension - dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID - - case 'f__5370_durchmesser': - dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['diameter'] # Type - dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension - dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID - - case 'f__5380_gewicht': - dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['weight'] # Type - dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension - dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID - case 'f__538h_hist__gewicht': - dimensionValues.setdefault(key, {})['f31e9c7e2de5549daea1790a74615288'] = ['historical_weight'] # Type - dimensionValues[key]['f3f805d270890837a6493e7e60a96487'] = value # Dimension - dimensionValues[key]['f802fd7bf45be523a9b188411a591420'] = [str(uuid.uuid4())] # UUID - case 'f__55ng_darst__schlagw_': - artifactValues['f6abbd4f39a6f79de5de2b14b98e51ff'] = value # Keywords - case 'f__5bes_beschreibung': - artifactValues['f26ad2bc1f084478cd7011f7b8451526'] = value # Description - case 'f__5ges_geschichte': - artifactValues['f40120d7c13ef02b486c69245f6c2306'] = value # History - case 'f__68an_abdruck_nr_': - artifactValues['fd3740649cc06f45677eb0546908cdac'] = value # Print Number - case 'f__8540_repro_nr_': - # We map images to Image entity - for item in value: - if item is not None: - # Replace dir paths in name - item = item.replace('Objekte/', 'objects/') - item = item.replace('Objekte\\', 'objects/') - item = item.replace('Objekte3\\', 'objects/') - item = item.replace('Objekte4\\', 'objects/') - item = item.replace('Objekte5\\', 'objects/') - item = item.replace('objekte5\\', 'objects/') - item = item.replace('Marken\\', 'marks/') - item = item.replace('Marken/', 'marks/') - imageValues.setdefault(item, {})['feb10344eaa7a5f414d1e8392853eba9'] = [item] # Reproduction Number (Image) - imageValues[item]['fc8d57e55f203c75c2f8a1ae79378ac7'] = ['public://artifact_images/' + item + '.jpg'] # File - imageValues[item]['f11beac4b638016479e6f3fbc7e55d1a'] = [str(uuid.uuid4())] # UUID - case 'f__stwv_statwerkverz': - artifactValues['fee0db94d62fae6370a89ff4757ff539'] = value # Catalogue_of_Works - case 'f__9990_kommentar': - artifactValues['fefe289aa0c9563a153be6da7d37e3ff'] = value # Comment - case 'f__9900_datum_erfassung': - digitisationProcessValues['f1f5dd22371e5c1de41e0fb099e0e862'] = value # Recording date - case 'f__99ae_datum_aenderung': - digitisationProcessValues['f8976c6a9e5d91fe9caba8a57c27f204'] = value # Change date - case 'f__efbm_bem_erfassung': - digitisationProcessValues['f78a6310d13c717b82ddba814ac59024'] = value # Recording note - case 'f__ptxt_plug_in_text': - artifactValues['ffb8b04e8d57929a596fc32d6a84d07d'] = value # Plugin text - case _: - print(f'{key} is not a valid field, skipping.') - - # Create Production Place Assignment - productionPlaceAssignment = Entity(api=api, fields=productionPlaceAssignmentValues, bundle_id='b13bc6dc04d4bbdafb9536987eb43244') - api.save(productionPlaceAssignment) # Kai says, we can save all entities at once, but I save it instantly - - - # Create Dimension entities and add their UUIDs to a list - # because we link Artifact and Dimension over the UUID - dimension = [] - for key, value in dimensionValues.items(): - dimensionItem = Entity(api=api, fields=value, bundle_id='b73258adf62f35bd1be3fa2863fab558') - api.save(dimensionItem) - dimension.append(value['f802fd7bf45be523a9b188411a591420'][0]) - - # Create Image entities and add their UUIDs to a list - # because we link Image Assignment and Image over the UUID - imageList = [] - for key, value in imageValues.items(): - imageItem = Entity(api=api, fields=value, bundle_id='b8c6c4b478ead1c80e175ad0f98dafe3') - api.save(imageItem) - imageList.append(value['f11beac4b638016479e6f3fbc7e55d1a'][0]) - - # Create Image Assignment entities and add their UUIDs to a list - # because we link Artifact and Image Assignment over the UUID - if imageList: - imageAssignmentValues['f70afb79b45472fee3d02f011caa4b36'] = imageList # List of Image UUIDs - imageAssignment = Entity(api=api, fields=imageAssignmentValues, bundle_id='b88e5d94fb2a83d62df99cf64d6c010c') - api.save(imageAssignment) - - # Create Digitisation Process - digitisationProcess = Entity(api=api, fields=digitisationProcessValues, bundle_id='b22e6c47ccb3ab8a974b37279e1bc33b') - api.save(digitisationProcess) - - # Add the field values for reference - # UWAGA! Is the Value Production Place Assignment Correct? UWAGA! - artifactValues['f2676a0fb8db6ab62235328ae7c7a4b3'] = [productionPlaceAssignmentValues['f40cc95db3ccaa1dbbf27294338d9f07'][0]] # Production Place Assignment - artifactValues['fc700eb3f24f4f2a6c165128aa7117f1'] = dimension # Dimension - artifactValues['f7af1cd9c77448281dd7ecf29ba57e3e'] = [imageAssignmentValues['f067784f5b1ff850672124a2b05360de'][0]] # Image Assignment - artifactValues['f5a3f90d920da3db4cfdbaa6264b0e89'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process - - # Create Artifact - artifact = Entity(api=api, fields=artifactValues, bundle_id='bd30c2c64a3caa8bb1628c780c3f24bb') - api.save(artifact) - - print(f'Created artifact {index}: {artifact.uri} of {len(artifactsTable)}') - - # Write log - processedRows = processedRows._append({'artifactId': artifactValues['f7e2a8a273ab3d577bf5854902550c09'][0], 'uuid': artifactValues['feb48c9a7efc444449b4b8defcd6d8bd'][0], 'uri': artifact.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processedArtifacts.csv', index=False) - -print('finish') diff --git a/22_importArtifactRelation.py b/22_importArtifactRelation.py index 4a5c148..b57b09e 100644 --- a/22_importArtifactRelation.py +++ b/22_importArtifactRelation.py @@ -5,84 +5,69 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtifactRelation(api, engine): + print('Importing artifact relation...') -# Load the environment variables -load_dotenv() + tableName = "c__5007_beziehung" + bundleId = 'bf4a13ee46de57819f88834caaddc301' # Artifact relation assignment -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "c__5007_beziehung" -bundleId = 'bf4a13ee46de57819f88834caaddc301' # Artifact relation assignment - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.ioc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts - entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed artifact relation {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__uuid': - entityValues['ff7ebd530eb53efc489e80d9bbef293e'] = value # UUID - uuid = value[0] - case 'f__5008_bez_obj_nr_': - entityValues['f39d0e5207a375070d84b958017a62e8'] = value # Artifact Document Identifier - case 'f__bebm_bem_beziehung': - entityValues['f9cc743b648716684ccc3a7b9710d0ed'] = value # Note - case 'f__5007_beziehung': - entityValues['f4d3047b3b54285aa5a86183aedb1680'] = value # Relation - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__uuid': + entityValues['ff7ebd530eb53efc489e80d9bbef293e'] = value # UUID + fUuid = value[0] + case 'f__5008_bez_obj_nr_': + entityValues['f39d0e5207a375070d84b958017a62e8'] = value # Artifact Document Identifier + case 'f__bebm_bem_beziehung': + entityValues['f9cc743b648716684ccc3a7b9710d0ed'] = value # Note + case 'f__5007_beziehung': + entityValues['f4d3047b3b54285aa5a86183aedb1680'] = value # Relation + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created artifact relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finished importing artifact relation') diff --git a/24_importArtistAssignment.py b/24_importArtistAssignment.py index fce5f08..0bccaaa 100644 --- a/24_importArtistAssignment.py +++ b/24_importArtistAssignment.py @@ -5,79 +5,67 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtistAssignment(api, engine): + print('Importing artist assignment...') -# Load the environment variables -load_dotenv() + tableName = "c__ob30_bez_kuenstler" + bundleId = 'bc8826cc7d9c9373ce71cfc0251c2a4f' -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri']) -try: - processedRows = pd.read_csv(f'./logs/processedArtistAssignment.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) + # Load sources table + artistRelationsTable = pd.read_sql_table(tableName, con=engine) -# Load sources table -artistRelationsTable = pd.read_sql_table('c__ob30_bez_kuenstler', con=engine) - -artistRelationValues = {} - -# Create artistRelations -for index, row in artistRelationsTable.iterrows(): - # For every row in table... - if index < len(processedRows) and artistRelationsTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed artistRelation {artistRelationsTable.loc[index, 'id']}') - continue - # Create Entity property dicts - for key, value in row.items(): - print('value: ', value) - - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create artistRelations + for index, row in artistRelationsTable.iterrows(): + # For every row in table... + if index < len(processedRows) and artistRelationsTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed artistAssignment {artistRelationsTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - match key: - case 'id': - docId = value[0] - case 'f__uuid': - artistRelationValues['fc150259d31fea8a3f992e7beb901fa4'] = value # UUID - case 'f__3100_name': - artistRelationValues['ff5bf58133f9351d03e2ee92b6f8bb7e'] = value # Artist Name - case 'f__3475_ber__funkt_': - artistRelationValues['fc0c7d8c6b736489210bc42ef0f1406a'] = value # Occupation - case 'f__ob30_bez_kuenstler': - artistRelationValues['f575d4f2c8ea5d37618cea708c2a7c5e'] = value # Relation - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + artistRelationValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + match key: + case 'id': + docId = value[0] + case 'f__uuid': + artistRelationValues['fc150259d31fea8a3f992e7beb901fa4'] = value # UUID + case 'f__3100_name': + artistRelationValues['ff5bf58133f9351d03e2ee92b6f8bb7e'] = value # Artist Name + case 'f__3475_ber__funkt_': + artistRelationValues['fc0c7d8c6b736489210bc42ef0f1406a'] = value # Occupation + case 'f__ob30_bez_kuenstler': + artistRelationValues['f575d4f2c8ea5d37618cea708c2a7c5e'] = value # Relation + case _: + print(f'{key} is not a valid field, skipping.') - artistRelation = Entity(api=api, fields=artistRelationValues, bundle_id='bc8826cc7d9c9373ce71cfc0251c2a4f') - api.save(artistRelation) + artistRelation = Entity(api=api, fields=artistRelationValues, bundle_id=bundleId) + api.save(artistRelation) - print(f'Created artistRelation {index}: {artistRelation.uri} of {len(artistRelationsTable)}') + print(f'Created artist assignment {index}: {artistRelation.uri} of {len(artistRelationsTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': artistRelationValues['fc150259d31fea8a3f992e7beb901fa4'][0], 'uri': artistRelation.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processedArtistAssignment.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': artistRelationValues['fc150259d31fea8a3f992e7beb901fa4'][0], 'uri': artistRelation.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finished importing artist assignment') diff --git a/25_importMarkInformation.py b/25_importMarkInformation.py new file mode 100644 index 0000000..1a40933 --- /dev/null +++ b/25_importMarkInformation.py @@ -0,0 +1,82 @@ +import uuid # For UUID creation +from initDb import initDb # For database initialization +from wisski.api import Api, Pathbuilder, Entity # For WissKI API +import os # For environment variable loading +from dotenv import load_dotenv # For environment variable loading +import pandas as pd # For dataframe handling + +def importMarkInformation(api, engine): + print('Importing mark information...') + + tableName = "c__6760_markenart" + bundleId = 'bc7ce6906f78e760f22ff13226b1332d' # Mark information assignment + + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=['id', 'docId', 'uuid', 'uri']) + + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') + continue + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + continue + case 'f__uuid': + entityValues['f3b8aaf7e79229b4da8214d491e375ec'] = value # UUID + fUuid = value[0] + case 'f__5064_num__dat_': + entityValues['fe6921098808e68cae68f0858411826c'] = value # Artist Assignment + case 'f__6894_anbr_ort': + entityValues['f694ed57271ab7be57249e0ee5c41ba4'] = value # Location + case 'f__6700_mar_dok_nr_': + entityValues['fdd3380d4a11654f32687429796cabc3'] = value # Mark Document Number + case 'f__6760_markenart': + entityValues['fd381aa9c3ebdf417e6cbccd60ede279'] = value # Mark Type + case 'f__684c_bedeutung_bz': + entityValues['f4947de52885f517baef0cdf3cb53b61'] = value # Meaning Inspection Mark + case 'f__684a_bedeutung_mz': + entityValues['f542c4c945725c6fdc5ab6409a877f02'] = value # Meaning Master Mark + case 'f__6770_rosenb_nr_': + entityValues['f0ff7020a9c25ea2706875837fe61b04'] = value # Rosenberg Number + + case _: + print(f'{key} is not a valid field, skipping.') + + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) + + print(f'Created mark information {index}: {entity.uri} of {len(sqlTable)}') + + # Write log + processedRows = processedRows._append({'id': row['id'], 'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + + print('finish') diff --git a/25_importPhotographer.py b/25_importPhotographer.py deleted file mode 100644 index 0fbaead..0000000 --- a/25_importPhotographer.py +++ /dev/null @@ -1,90 +0,0 @@ -import uuid # For UUID creation -from initDb import initDb # For database initialization -from wisski.api import Api, Pathbuilder, Entity # For WissKI API -import os # For environment variable loading -from dotenv import load_dotenv # For environment variable loading -import pandas as pd # For dataframe handling - -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() - -# Load the environment variables -load_dotenv() - -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('default') - - -tableName = "c__8490_fotograf" -bundleId = 'b821fb6c518948b7f40d17803b6ce293' # Photographer assignment - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts - entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value - continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__uuid': - entityValues['f6c3c3e35af2f2073fd517aabf88fa7c'] = value # UUID - docUuid = value[0] - case 'f__8490_fotograf': - entityValues['fe8f8b235f896862b74caa0fa8f3682d'] = value # Photographer - case 'f__8494_aufn_datum': - entityValues['f12c7538643314f0f46ba76a5140a87d'] = value # Recording Date - case 'f__8470_aufnahmenr_': - entityValues['ff6ec986fb4cc5a2f34deb7144f2f817'] = value # Recording number - case 'f__849r_repro_datei': # Image Assignment - entityValues['f24a609593559a904a0a0f2e215db584'] = value # Reproduction Number - case _: - print(f'{key} is not a valid field, skipping.') - - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) - - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') - - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': docUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - -print('finish') diff --git a/26_importPhotographer.py b/26_importPhotographer.py new file mode 100644 index 0000000..6a3550e --- /dev/null +++ b/26_importPhotographer.py @@ -0,0 +1,75 @@ +import uuid # For UUID creation +from initDb import initDb # For database initialization +from wisski.api import Api, Pathbuilder, Entity # For WissKI API +import os # For environment variable loading +from dotenv import load_dotenv # For environment variable loading +import pandas as pd # For dataframe handling + +def importPhotographer(api, engine): + print('Importing photographer...') + + tableName = "c__8490_fotograf" + bundleId = 'b821fb6c518948b7f40d17803b6ce293' # Photographer assignment + + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) + + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') + continue + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__uuid': + entityValues['f6c3c3e35af2f2073fd517aabf88fa7c'] = value # UUID + docUuid = value[0] + case 'f__8490_fotograf': + entityValues['fe8f8b235f896862b74caa0fa8f3682d'] = value # Photographer + case 'f__8494_aufn_datum': + entityValues['f12c7538643314f0f46ba76a5140a87d'] = value # Recording Date + case 'f__8470_aufnahmenr_': + entityValues['ff6ec986fb4cc5a2f34deb7144f2f817'] = value # Recording number + case 'f__849r_repro_datei': # Image Assignment + entityValues['f24a609593559a904a0a0f2e215db584'] = value # Reproduction Number + case _: + print(f'{key} is not a valid field, skipping.') + + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) + + print(f'Created Photographer {index}: {entity.uri} of {len(sqlTable)}') + + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': docUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + + print('finished importing photographer') diff --git a/98__r__importArtifactToArtistRelationRelation.py b/98__r__importArtifactToArtistRelationRelation.py index 7e0c960..c42008b 100644 --- a/98__r__importArtifactToArtistRelationRelation.py +++ b/98__r__importArtifactToArtistRelationRelation.py @@ -5,78 +5,64 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtifactToArtistRelationRelation(api, engine): + print('importing artifact to artist relation relation') + tableName = "r__obj__ob30_bez_kuenstler" + bundleId = 'b8b4e3b3fb7e3b83cec037aea51814bf' # Artifact to artist relation relation + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -# Load the environment variables -load_dotenv() + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + entityValues = {} - -tableName = "r__obj__ob30_bez_kuenstler" -bundleId = 'b8b4e3b3fb7e3b83cec037aea51814bf' # Artifact to artist relation relation -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__obj__uuid': - entityValues['f92631e8a40aae0aa8adbe84ab5dc97f'] = value # Artifact UUID - fUuid = value[0] - case 'f__ob30_bez_kuenstler__uuid': - entityValues['f07e9587430d70bc46926488129ba4a8'] = value # Artist Relation UUID - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__obj__uuid': + entityValues['f92631e8a40aae0aa8adbe84ab5dc97f'] = value # Artifact UUID + fUuid = value[0] + case 'f__ob30_bez_kuenstler__uuid': + entityValues['f07e9587430d70bc46926488129ba4a8'] = value # Artist Relation UUID + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created artifact to artist relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finished importing artifact to artist relation relation') diff --git a/98__r__importArtifactToClientAssignmentRelation.py b/98__r__importArtifactToClientAssignmentRelation.py index 4418c3a..fef6590 100644 --- a/98__r__importArtifactToClientAssignmentRelation.py +++ b/98__r__importArtifactToClientAssignmentRelation.py @@ -5,80 +5,67 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtifactToClientAssignmentRelation(api, engine): + print('importing artifact to client assignment relation') -# Load the environment variables -load_dotenv() + tableName = "r__obj__410a_auftraggeber" + bundleId = 'b20d53dcc2bad79457251a581611b43f' # Artifact to client assignment relation -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "r__obj__410a_auftraggeber" -bundleId = 'b20d53dcc2bad79457251a581611b43f' # Artifact to client assignment relation - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__obj__uuid': - entityValues['fc369de9f2f7ac73585f7c967f415703'] = value # Artifact UUID - fUuid = value[0] - case 'f__410a_auftraggeber__uuid': - entityValues['fe65c6437d49877bad3de9ce31e19772'] = value # Client UUID - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__obj__uuid': + entityValues['fc369de9f2f7ac73585f7c967f415703'] = value # Artifact UUID + fUuid = value[0] + case 'f__410a_auftraggeber__uuid': + entityValues['fe65c6437d49877bad3de9ce31e19772'] = value # Client UUID + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Artifact to Client Assignment Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finished importing artifact to client assignment relation') diff --git a/98__r__importArtifactToInspectionMarkLocationRelation.py b/98__r__importArtifactToInspectionMarkLocationRelation.py index a9abe1b..1c8f0d5 100644 --- a/98__r__importArtifactToInspectionMarkLocationRelation.py +++ b/98__r__importArtifactToInspectionMarkLocationRelation.py @@ -5,79 +5,65 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtifactToInspectionMarkLocationRelation(api, engine): + print('importing artifact to inspection mark location relation') + tableName = "r__obj__67b0_bz_dok_nr" + bundleId = 'b7fe64e0326c107a1a4a705be08392fa' # Artifact to inspection mark location relation -# Load the environment variables -load_dotenv() + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) + entityValues = {} -tableName = "r__obj__67b0_bz_dok_nr" -bundleId = 'b7fe64e0326c107a1a4a705be08392fa' # Artifact to inspection mark location relation - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__obj__uuid': - entityValues['f7ed714f705f51f4893427c7ba14dae8'] = value # Artifact UUID - fUuid = value[0] - case 'f__67b0_bz_dok_nr__uuid': - entityValues['f7a330c34474ecf06737a334dd754e8b'] = value # Inspection Mark location assignment - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__obj__uuid': + entityValues['f7ed714f705f51f4893427c7ba14dae8'] = value # Artifact UUID + fUuid = value[0] + case 'f__67b0_bz_dok_nr__uuid': + entityValues['f7a330c34474ecf06737a334dd754e8b'] = value # Inspection Mark location assignment + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Artifact to Inspection Mark Location Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finished importing artifact to inspection mark location relation') diff --git a/98__r__importArtifactToLiteratureReferenceAssignmentRelation.py b/98__r__importArtifactToLiteratureReferenceAssignmentRelation.py index a076749..59a1e7f 100644 --- a/98__r__importArtifactToLiteratureReferenceAssignmentRelation.py +++ b/98__r__importArtifactToLiteratureReferenceAssignmentRelation.py @@ -5,79 +5,65 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtifactToLiteratureReferenceAssignmentRelation(api, engine): + print('importing artifact to literature reference assignment relation') + tableName = "r__obj__8330_lit_kurzt_" + bundleId = 'b6a7b7aad942ecff4b3beadf907d51c8' # Artifact to literature relation + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -# Load the environment variables -load_dotenv() + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') - - -tableName = "r__obj__8330_lit_kurzt_" -bundleId = 'b6a7b7aad942ecff4b3beadf907d51c8' # Artifact to literature relation -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__obj__uuid': - entityValues['f6c41b894b0a00c2c28860f513c5bb77'] = value # Artifact UUID - fUuid = value[0] - case 'f__8330_lit_kurzt___uuid': - entityValues['f5284765cef8e6974676adcb59791960'] = value # - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__obj__uuid': + entityValues['f6c41b894b0a00c2c28860f513c5bb77'] = value # Artifact UUID + fUuid = value[0] + case 'f__8330_lit_kurzt___uuid': + entityValues['f5284765cef8e6974676adcb59791960'] = value # + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Artifact to Literature Reference Assignment Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finished importing artifact to literature reference assignment relation') diff --git a/98__r__importArtifactToMarkInformationAssignmentRelation.py b/98__r__importArtifactToMarkInformationAssignmentRelation.py index 96a3bb9..0fb16b5 100644 --- a/98__r__importArtifactToMarkInformationAssignmentRelation.py +++ b/98__r__importArtifactToMarkInformationAssignmentRelation.py @@ -5,79 +5,66 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtifactToMarkInformationAssignmentRelation(api, engine): + print('importing artifact to mark information assignment relation') -# Load the environment variables -load_dotenv() + tableName = "r__obj__6760_markenart" + bundleId = 'b7112c2a7ea92a1d263d42d5572a05fc' # Artifact to mark information assignment relation + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) - -tableName = "r__obj__6760_markenart" -bundleId = 'b7112c2a7ea92a1d263d42d5572a05fc' # Artifact to mark information assignment relation -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__obj__uuid': - entityValues['fcf4cbb8b01e4a02ffd041ba4040f890'] = value # Artifact UUID - fUuid = value[0] - case 'f__6760_markenart__uuid': - entityValues['fb6de3d2433630fc205fe1ef7f24639f'] = value # - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__obj__uuid': + entityValues['fcf4cbb8b01e4a02ffd041ba4040f890'] = value # Artifact UUID + fUuid = value[0] + case 'f__6760_markenart__uuid': + entityValues['fb6de3d2433630fc205fe1ef7f24639f'] = value # + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Artifact to Mark Information Assignment Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finished importing artifact to mark information assignment relation') diff --git a/98__r__importArtifactToMaterialRelation.py b/98__r__importArtifactToMaterialRelation.py index f5ebb7c..267c82c 100644 --- a/98__r__importArtifactToMaterialRelation.py +++ b/98__r__importArtifactToMaterialRelation.py @@ -5,79 +5,66 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtifactToMaterialRelation(api, engine): + print('importing artifact to material relation') -# Load the environment variables -load_dotenv() + tableName = "r__obj__5280_material" + bundleId = 'b825aff7df3d48bd875e2a081c796305' # Artifact to material relation + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) - -tableName = "r__obj__5280_material" -bundleId = 'b825aff7df3d48bd875e2a081c796305' # Artifact to material relation -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__obj__uuid': - entityValues['f9f07bf63ccafd4eb2c0de24c73e1664'] = value # Artifact UUID - fUuid = value[0] - case 'f__5280_material__uuid': - entityValues['f820534abde4c2a2d19e0d19f7793cf0'] = value # - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__obj__uuid': + entityValues['f9f07bf63ccafd4eb2c0de24c73e1664'] = value # Artifact UUID + fUuid = value[0] + case 'f__5280_material__uuid': + entityValues['f820534abde4c2a2d19e0d19f7793cf0'] = value # + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Artifact to Material Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finished importing artifact to material relation') diff --git a/98__r__importArtifactToNumericeDateRelation.py b/98__r__importArtifactToNumericeDateRelation.py index e9ee47f..bf65b68 100644 --- a/98__r__importArtifactToNumericeDateRelation.py +++ b/98__r__importArtifactToNumericeDateRelation.py @@ -5,80 +5,67 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtifactToNumericeDateRelation(api, engine): + print('importing artifact to numeric date relation') -# Load the environment variables -load_dotenv() + tableName = "r__obj__5064_num__dat_" + bundleId = 'b795fcfa6c684fa707c236c4b0882ad7' # Artifact to numeric date relation -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "r__obj__5064_num__dat_" -bundleId = 'b795fcfa6c684fa707c236c4b0882ad7' # Artifact to numeric date relation - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__obj__uuid': - entityValues['fc69105d5a6931fc1d2b53cee7ef8b22'] = value # Artifact UUID - fUuid = value[0] - case 'f__5064_num__dat___uuid': - entityValues['fff143b7bfc1308cac53789304a1aff2'] = value # Numeric Date UUID - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__obj__uuid': + entityValues['fc69105d5a6931fc1d2b53cee7ef8b22'] = value # Artifact UUID + fUuid = value[0] + case 'f__5064_num__dat___uuid': + entityValues['fff143b7bfc1308cac53789304a1aff2'] = value # Numeric Date UUID + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Artifact to Numeric Date Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finished importing artifact to numeric date relation') diff --git a/98__r__importArtifactToPhotographRelation.py b/98__r__importArtifactToPhotographRelation.py index 25c235c..da9159e 100644 --- a/98__r__importArtifactToPhotographRelation.py +++ b/98__r__importArtifactToPhotographRelation.py @@ -5,79 +5,66 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtifactToPhotographRelation(api, engine): + print('importing artifact to photograph relation') -# Load the environment variables -load_dotenv() + tableName = "r__obj__8490_fotograf" + bundleId = 'b63cd713e60b6e5bc3b2235dffc0dba9' # Artifact to photograph relation + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) - -tableName = "r__obj__8490_fotograf" -bundleId = 'b63cd713e60b6e5bc3b2235dffc0dba9' # Artifact to photograph relation -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__obj__uuid': - entityValues['f88af5d8b4e289c0cde4df32f76a2804'] = value # Artifact UUID - fUuid = value[0] - case 'f__8490_fotograf__uuid': - entityValues['fe2f0af4ba38024fb0f796d4a98af511'] = value # - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__obj__uuid': + entityValues['f88af5d8b4e289c0cde4df32f76a2804'] = value # Artifact UUID + fUuid = value[0] + case 'f__8490_fotograf__uuid': + entityValues['fe2f0af4ba38024fb0f796d4a98af511'] = value # + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Artifact to Photograph Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finished importing artifact to photograph relation') diff --git a/98__r__importArtifactToRelationRelation.py b/98__r__importArtifactToRelationRelation.py index 627c01c..4cc0e7f 100644 --- a/98__r__importArtifactToRelationRelation.py +++ b/98__r__importArtifactToRelationRelation.py @@ -5,80 +5,67 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtifactToRelationRelation(api, engine): + print('importing artifact to relation relation') -# Load the environment variables -load_dotenv() + tableName = "r__obj__5007_beziehung" + bundleId = 'bb878dd9c44c83a70fbd151f1dc06b4d' # Artifact to relation relation -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "r__obj__5007_beziehung" -bundleId = 'bb878dd9c44c83a70fbd151f1dc06b4d' # Artifact to relation relation - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__obj__uuid': - entityValues['fe537502d55fd4a4482449a0174a3d98'] = value # Artifact UUID - fUuid = value[0] - case 'f__5007_beziehung__uuid': - entityValues['f82f33fa9640d894170c5221d02f583a'] = value # Relation UUID - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__obj__uuid': + entityValues['fe537502d55fd4a4482449a0174a3d98'] = value # Artifact UUID + fUuid = value[0] + case 'f__5007_beziehung__uuid': + entityValues['f82f33fa9640d894170c5221d02f583a'] = value # Relation UUID + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Artifact to Relation Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) -print('finish') + print('finished importing artifact to relation relation') diff --git a/98__r__importArtifactToSourceRelation.py b/98__r__importArtifactToSourceRelation.py index d7cdf90..198a091 100644 --- a/98__r__importArtifactToSourceRelation.py +++ b/98__r__importArtifactToSourceRelation.py @@ -5,78 +5,65 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtifactToSourceRelation(api, engine): + print('importing artifact to source relation') -# Load the environment variables -load_dotenv() + tableName = "r__obj__8130_que_kurzt_" + bundleId = 'bcf720dc0b796043915d6da536414451' # Artifact to source relation + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) - -tableName = "r__obj__8130_que_kurzt_" -bundleId = 'bcf720dc0b796043915d6da536414451' # Artifact to source relation -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__obj__uuid': - entityValues['fc8eb74a6ba0c51a82972ff19fec53e8'] = value # Artifact UUID - fUuid = value[0] - case 'f__8130_que_kurzt___uuid': - entityValues['fbfbf828330ed4ec85797ea274f73bb8'] = value # - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__obj__uuid': + entityValues['fc8eb74a6ba0c51a82972ff19fec53e8'] = value # Artifact UUID + fUuid = value[0] + case 'f__8130_que_kurzt___uuid': + entityValues['fbfbf828330ed4ec85797ea274f73bb8'] = value # + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Artifact to Source Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) -print('finish') + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + print('finished importing artifact to source relation') diff --git a/98__r__importArtifactToStatusAdministratorRelation.py b/98__r__importArtifactToStatusAdministratorRelation.py index 1815ddb..4a7764b 100644 --- a/98__r__importArtifactToStatusAdministratorRelation.py +++ b/98__r__importArtifactToStatusAdministratorRelation.py @@ -5,82 +5,69 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtifactToStatusAdministratorRelation(api, engine): + print('importing artifact to status administrator relation') + test = False + tableName = "r__obj__ob28_status_verwalt_" + bundleId = 'bd4922f100ab534fc1213f767770ed6d' # Artifact to status adminstrator relation + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -# Load the environment variables -load_dotenv() + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') - -test = False -tableName = "r__obj__ob28_status_verwalt_" -bundleId = 'bd4922f100ab534fc1213f767770ed6d' # Artifact to status adminstrator relation -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__obj__uuid': - entityValues['f355304194b190e2fee22a99d54ebc92'] = value # Artifact UUID - fUuid = value[0] - case 'f__ob28_status_verwalt___uuid': - entityValues['fcc8a9758ce7a2659bfe96242ec4a15e'] = value # - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__obj__uuid': + entityValues['f355304194b190e2fee22a99d54ebc92'] = value # Artifact UUID + fUuid = value[0] + case 'f__ob28_status_verwalt___uuid': + entityValues['fcc8a9758ce7a2659bfe96242ec4a15e'] = value # + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Artifact to Status Administrator Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) - if test: - exit() + if test: + exit() -print('finish') + print('finished importing artifact to status administrator relation') diff --git a/98__r__importArtistToBirthRelation.py b/98__r__importArtistToBirthRelation.py index 61f5e16..c1c6791 100644 --- a/98__r__importArtistToBirthRelation.py +++ b/98__r__importArtistToBirthRelation.py @@ -5,83 +5,70 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtistToBirthRelation(api, engine): + print('importing artist to birth relation') -# Load the environment variables -load_dotenv() + test = False + tableName = "r__kue__3270_geb_datum" + bundleId = 'b82e4404cdf641db57c03d7e3b23947c' # Artist to birth relation -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -test = False -tableName = "r__kue__3270_geb_datum" -bundleId = 'b82e4404cdf641db57c03d7e3b23947c' # Artist to birth relation - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__kue__uuid': - entityValues['f07d608ae6abf891e54c0f57b5f78507'] = value # Date - fUuid = value[0] - case 'f__3270_geb_datum__uuid': - entityValues['f70978f842342d920db490d420339dae'] = value # Dating - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__kue__uuid': + entityValues['f07d608ae6abf891e54c0f57b5f78507'] = value # Date + fUuid = value[0] + case 'f__3270_geb_datum__uuid': + entityValues['f70978f842342d920db490d420339dae'] = value # Dating + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Artist to Birth Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() -print('finish') + print('finished importing artist to birth relation') diff --git a/98__r__importArtistToDeathRelation.py b/98__r__importArtistToDeathRelation.py index 9474d15..d0a0b97 100644 --- a/98__r__importArtistToDeathRelation.py +++ b/98__r__importArtistToDeathRelation.py @@ -5,81 +5,68 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtistToDeathRelation(api, engine): + print('importing artist to death relation') + test = False -# Load the environment variables -load_dotenv() + tableName = "r__kue__3330_todes_dat_" + bundleId = 'b91ed11c8063a363063582f001a3f5a2' # Artist to death relation + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -test = False - -tableName = "r__kue__3330_todes_dat_" -bundleId = 'b91ed11c8063a363063582f001a3f5a2' # Artist to death relation -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__kue__uuid': - entityValues['f2b9ebb823502c1bba835d2f57102815'] = value # Artist UUID - fUuid = value[0] - case 'f__3330_todes_dat___uuid': - entityValues['f6286ce1789410919bd6fc3f1a7f2e05'] = value # - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__kue__uuid': + entityValues['f2b9ebb823502c1bba835d2f57102815'] = value # Artist UUID + fUuid = value[0] + case 'f__3330_todes_dat___uuid': + entityValues['f6286ce1789410919bd6fc3f1a7f2e05'] = value # + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Artist to Death Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() -print('finish') + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() + print('finished importing artist to death relation') diff --git a/98__r__importArtistToGoldsmithRelation.py b/98__r__importArtistToGoldsmithRelation.py index c1e11cb..ee61322 100644 --- a/98__r__importArtistToGoldsmithRelation.py +++ b/98__r__importArtistToGoldsmithRelation.py @@ -5,82 +5,69 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtistToGoldsmithRelation(api, engine): + print('importing artist to goldsmith relation') + test = False -# Load the environment variables -load_dotenv() + tableName = "r__kue__3007_bezieh__zu_gs" + bundleId = 'b464b2b43aaa27aaba71e337c9af649c' # Artist to goldsmith relation -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -test = False + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "r__kue__3007_bezieh__zu_gs" -bundleId = 'b464b2b43aaa27aaba71e337c9af649c' # Artist to goldsmith relation - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__kue__uuid': - entityValues['f972dfd248e362846f4cb5cc946eefc2'] = value # Date - fUuid = value[0] - case 'f__3007_bezieh__zu_gs__uuid': - entityValues['f37c88dc7451b8d1b82f702ef64f8b05'] = value # - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__kue__uuid': + entityValues['f972dfd248e362846f4cb5cc946eefc2'] = value # Date + fUuid = value[0] + case 'f__3007_bezieh__zu_gs__uuid': + entityValues['f37c88dc7451b8d1b82f702ef64f8b05'] = value # + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Artist to Goldsmith Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() -print('finish') + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() + print('finished importing artist to goldsmith relation') diff --git a/98__r__importArtistToLiteratureReferenceRelation.py b/98__r__importArtistToLiteratureReferenceRelation.py index bbb0dd6..e6cc265 100644 --- a/98__r__importArtistToLiteratureReferenceRelation.py +++ b/98__r__importArtistToLiteratureReferenceRelation.py @@ -5,82 +5,69 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtistToLiteratureReferenceRelation(api, engine): + print('importing artist to literature reference relation') + test = False -# Load the environment variables -load_dotenv() + tableName = "r__kue__8330_lit_kurzt_" + bundleId = 'b7a87e3f3d5f671c1f163101bff30eb6' # Artist to literature relation -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -test = False + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "r__kue__8330_lit_kurzt_" -bundleId = 'b7a87e3f3d5f671c1f163101bff30eb6' # Artist to literature relation - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__kue__uuid': - entityValues['f0b9b134818c592f93083d444817dffb'] = value # Date - fUuid = value[0] - case 'f__8330_lit_kurzt___uuid': - entityValues['f70fb4157e3ef66e4d1ed78880f092b2'] = value # - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__kue__uuid': + entityValues['f0b9b134818c592f93083d444817dffb'] = value # Date + fUuid = value[0] + case 'f__8330_lit_kurzt___uuid': + entityValues['f70fb4157e3ef66e4d1ed78880f092b2'] = value # + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Artist to Literature Reference Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() -print('finish') + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() + print('finished importing artist to literature reference relation') diff --git a/98__r__importArtistToMentionedRelation.py b/98__r__importArtistToMentionedRelation.py index be49cf1..0c4dc71 100644 --- a/98__r__importArtistToMentionedRelation.py +++ b/98__r__importArtistToMentionedRelation.py @@ -5,82 +5,69 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtistToMentionedRelation(api, engine): + print('importing artist to mentioned relation') + test = False -# Load the environment variables -load_dotenv() + tableName = "r__kue__7060_erwaehnt__datum_" + bundleId = 'bc2b0ddca583320a56a67b304dc0a045' # Artist to mentioned relation -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -test = False + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "r__kue__7060_erwaehnt__datum_" -bundleId = 'bc2b0ddca583320a56a67b304dc0a045' # Artist to mentioned relation - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__kue__uuid': - entityValues['f47b1ffe8394f389497b9e23407ad72f'] = value # Date - fUuid = value[0] - case 'f__7060_erwaehnt__datum___uuid': - entityValues['fabb90d487512fc5bf8d7379ff2d8bdb'] = value # Mentioned UUID - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__kue__uuid': + entityValues['f47b1ffe8394f389497b9e23407ad72f'] = value # Date + fUuid = value[0] + case 'f__7060_erwaehnt__datum___uuid': + entityValues['fabb90d487512fc5bf8d7379ff2d8bdb'] = value # Mentioned UUID + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Artist to Mentioned Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() -print('finish') + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() + print('finished importing artist to mentioned relation') diff --git a/98__r__importArtistToOriginRelation.py b/98__r__importArtistToOriginRelation.py index 7ad917a..5addacb 100644 --- a/98__r__importArtistToOriginRelation.py +++ b/98__r__importArtistToOriginRelation.py @@ -5,81 +5,68 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtistToOriginRelation(api, engine): + print('importing artist to origin relation') + test = False -# Load the environment variables -load_dotenv() + tableName = "r__kue__3204_herkunft" + bundleId = 'b5cf6b3e6fd2e4b5575da4347999d6ea' # Artist to origin relation + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -test = False - -tableName = "r__kue__3204_herkunft" -bundleId = 'b5cf6b3e6fd2e4b5575da4347999d6ea' # Artist to origin relation -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__kue__uuid': - entityValues['f40e702ecb7fe968c77c9f2ed0f1280c'] = value # Artist UUID - fUuid = value[0] - case 'f__3204_herkunft__uuid': - entityValues['f53bcd587a769e93ea54a34e6de4867d'] = value # Origin UUID - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__kue__uuid': + entityValues['f40e702ecb7fe968c77c9f2ed0f1280c'] = value # Artist UUID + fUuid = value[0] + case 'f__3204_herkunft__uuid': + entityValues['f53bcd587a769e93ea54a34e6de4867d'] = value # Origin UUID + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Artist to Origin Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() -print('finish') + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() + print('finished importing artist to origin relation') diff --git a/98__r__importArtistToWorkshopRelation.py b/98__r__importArtistToWorkshopRelation.py index bfb6984..61c24fb 100644 --- a/98__r__importArtistToWorkshopRelation.py +++ b/98__r__importArtistToWorkshopRelation.py @@ -5,82 +5,69 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importArtistToWorkshopRelation(api, engine): + print('importing artist to workshop relation') + test = False -# Load the environment variables -load_dotenv() + tableName = "r__kue__nfws_forts_werkst_" + bundleId = 'becb95326a733bdbd0c2dd3d36e3399d' # Artist to workshop relation -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -test = False + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "r__kue__nfws_forts_werkst_" -bundleId = 'becb95326a733bdbd0c2dd3d36e3399d' # Artist to workshop relation - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__kue__uuid': - entityValues['f1f107b495d9cf3f349932f2c6535505'] = value # Date - fUuid = value[0] - case 'f__nfws_forts_werkst___uuid': - entityValues['fc53912a0acb388e04eb6684eda209f1'] = value # Workshop UUID - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__kue__uuid': + entityValues['f1f107b495d9cf3f349932f2c6535505'] = value # Date + fUuid = value[0] + case 'f__nfws_forts_werkst___uuid': + entityValues['fc53912a0acb388e04eb6684eda209f1'] = value # Workshop UUID + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Artist to Workshop Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() -print('finish') + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() + print('finished importing artist to workshop relation') diff --git a/98__r__importInspectionMarkDatingInformationAssignmentRelation.py b/98__r__importInspectionMarkDatingInformationAssignmentRelation.py index effdf0f..9072ae8 100644 --- a/98__r__importInspectionMarkDatingInformationAssignmentRelation.py +++ b/98__r__importInspectionMarkDatingInformationAssignmentRelation.py @@ -5,82 +5,66 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importInspectionMarkDatingInformationAssignmentRelation(api, engine): + print('importing inspection mark dating information assignment relation') + test = False -# Load the environment variables -load_dotenv() + tableName = "r__bez__68dm_datierung_marke" + bundleId = 'b1fee832598b2d42ed17a927dad43b90' # Inspection Mark to dating information assignment relation -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -test = False + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "r__bez__68dm_datierung_marke" -bundleId = 'b1fee832598b2d42ed17a927dad43b90' # Inspection Mark to dating information assignment relation - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__bez__uuid': - entityValues['fac07ebf9c19d09995cc13ae1ba6f362'] = value # Date - fUuid = value[0] - case 'f__68dm_datierung_marke__uuid': - entityValues['ffd43be34e81e0dbfc1b8cccc5f32056'] = value # Dating - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__bez__uuid': + entityValues['fac07ebf9c19d09995cc13ae1ba6f362'] = value # Date + fUuid = value[0] + case 'f__68dm_datierung_marke__uuid': + entityValues['ffd43be34e81e0dbfc1b8cccc5f32056'] = value # Dating + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Inspection Mark to Dating Information Assignment Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() -print('finish') + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() + print('finished importing inspection mark dating information assignment relation') diff --git a/98__r__importInspectionMarkRelationRelation.py b/98__r__importInspectionMarkRelationRelation.py index 3ba035b..e794134 100644 --- a/98__r__importInspectionMarkRelationRelation.py +++ b/98__r__importInspectionMarkRelationRelation.py @@ -5,83 +5,70 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importInspectionMarkRelationRelation(api, engine): + print('importing inspection mark relation relation') + test = False -# Load the environment variables -load_dotenv() + tableName = "r__bez__67b7_beziehung" + bundleId = 'bc8dcd233a9b539db407bad219715988' # Inspection Mark Relation Relation -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -test = False + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "r__bez__67b7_beziehung" -bundleId = 'bc8dcd233a9b539db407bad219715988' # Inspection Mark Relation Relation - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] - case 'f__bez__uuid': - entityValues['fb9cc78d2351179c5f2f49b3b01be40b'] = value # Date - fUuid = value[0] - case 'f__67b7_beziehung__uuid': - entityValues['f468e7d8e91f04b902c6bc79fe365074'] = value # Note - case _: - print(f'{key} is not a valid field, skipping.') + case 'f__bez__uuid': + entityValues['fb9cc78d2351179c5f2f49b3b01be40b'] = value # Date + fUuid = value[0] + case 'f__67b7_beziehung__uuid': + entityValues['f468e7d8e91f04b902c6bc79fe365074'] = value # Note + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Inspection Mark to Relation Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() -print('finish') + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() + print('finished importing inspection mark relation relation') diff --git a/98__r__importInspectionMarkToLiteratureReferenceRelation.py b/98__r__importInspectionMarkToLiteratureReferenceRelation.py index 9a0e563..b9a3e67 100644 --- a/98__r__importInspectionMarkToLiteratureReferenceRelation.py +++ b/98__r__importInspectionMarkToLiteratureReferenceRelation.py @@ -5,82 +5,69 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importInspectionMarkToLiteratureReferenceRelation(api, engine): + print('importing inspection mark to literature reference relation') + test = False -# Load the environment variables -load_dotenv() + tableName = "r__bez__8330_lit_kurzt_" + bundleId = 'b32fc778865a1ffd5b165515425f38c6' # Inspection Mark to Dating Assignment -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=['docId', 'uri']) -test = False + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "r__bez__8330_lit_kurzt_" -bundleId = 'b32fc778865a1ffd5b165515425f38c6' # Inspection Mark to Dating Assignment - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__bez__uuid': - entityValues['f8670edfe030f375ca0b8b275a394511'] = value # Date - fUuid = value[0] - case 'f__8330_lit_kurzt___uuid': - entityValues['fa52476d733d0d106406864245d613b8'] = value # Literature Reference Assignment - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__bez__uuid': + entityValues['f8670edfe030f375ca0b8b275a394511'] = value # Date + fUuid = value[0] + case 'f__8330_lit_kurzt___uuid': + entityValues['fa52476d733d0d106406864245d613b8'] = value # Literature Reference Assignment + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Inspection Mark to Literature Reference Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() -print('finish') + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() + print('finished importing inspection mark to literature reference relation') diff --git a/98__r__importLiteratureToJournalRelation.py b/98__r__importLiteratureToJournalRelation.py index 91df694..8167486 100644 --- a/98__r__importLiteratureToJournalRelation.py +++ b/98__r__importLiteratureToJournalRelation.py @@ -5,82 +5,69 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importLiteratureToJournalRelation(api, engine): + print('importing literature to journal relation') + test = False -# Load the environment variables -load_dotenv() + tableName = "r__lit__8310_zeitschrift" + bundleId = 'b6c2ce0add1e7999f48d66b7ef1a4a26' # Literature to journal relation -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -test = False + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "r__lit__8310_zeitschrift" -bundleId = 'b6c2ce0add1e7999f48d66b7ef1a4a26' # Literature to journal relation - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__lit__uuid': - entityValues['fc751b683ba51648f4e7557e37e18228'] = value # Literature UUID - fUuid = value[0] - case 'f__8310_zeitschrift__uuid': - entityValues['fae46e3ca92e3a84b36df823fe0323bb'] = value # Journal UUID - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__lit__uuid': + entityValues['fc751b683ba51648f4e7557e37e18228'] = value # Literature UUID + fUuid = value[0] + case 'f__8310_zeitschrift__uuid': + entityValues['fae46e3ca92e3a84b36df823fe0323bb'] = value # Journal UUID + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Literature to Journal Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() -print('finish') + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() + print('finished importing literature to journal relation') diff --git a/98__r__importLiteratureToParentPublicationRelation.py b/98__r__importLiteratureToParentPublicationRelation.py index a7473de..cd26478 100644 --- a/98__r__importLiteratureToParentPublicationRelation.py +++ b/98__r__importLiteratureToParentPublicationRelation.py @@ -5,82 +5,69 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importLiteratureToParentPublicationRelation(api, engine): + print('importing literature to parent publication relation') + test = False -# Load the environment variables -load_dotenv() + tableName = "r__lit__8292_uebergeordn_publ_" + bundleId = 'b2adaaa15714d83ea83cd3333af437df' # Literature to parent publication relation -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -test = False + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "r__lit__8292_uebergeordn_publ_" -bundleId = 'b2adaaa15714d83ea83cd3333af437df' # Literature to parent publication relation - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__lit__uuid': - entityValues['f1ecd1cf9be1081507f9c8f3758bafe9'] = value # Date - fUuid = value[0] - case 'f__8292_uebergeordn_publ___uuid': - entityValues['f9997e4bbacb1c26a945825cfe5b6de2'] = value # - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__lit__uuid': + entityValues['f1ecd1cf9be1081507f9c8f3758bafe9'] = value # Date + fUuid = value[0] + case 'f__8292_uebergeordn_publ___uuid': + entityValues['f9997e4bbacb1c26a945825cfe5b6de2'] = value # + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Literature to Parent Publication Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() -print('finish') + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() + print('finished importing literature to parent publication relation') diff --git a/98__r__importMarkToDatingRelation.py b/98__r__importMarkToDatingRelation.py index 048aaa4..dbf1df4 100644 --- a/98__r__importMarkToDatingRelation.py +++ b/98__r__importMarkToDatingRelation.py @@ -5,82 +5,66 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importMarkToDatingRelation(api, engine): + print('importing mark to dating relation') + test = False -# Load the environment variables -load_dotenv() + tableName = "r__mar__68dm_datierung_marke" + bundleId = 'b105b749b25de3aa55329b82fe18c18d' # Mark to dating relation -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -test = False + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "r__mar__68dm_datierung_marke" -bundleId = 'b105b749b25de3aa55329b82fe18c18d' # Mark to dating relation - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__mar__uuid': - entityValues['f11c6eedcfc833dabffd356f57be7e15'] = value # Date - fUuid = value[0] - case 'f__68dm_datierung_marke__uuid': - entityValues['f2b469f3a10721ab891e01b1d9817612'] = value # Note - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__mar__uuid': + entityValues['f11c6eedcfc833dabffd356f57be7e15'] = value # Date + fUuid = value[0] + case 'f__68dm_datierung_marke__uuid': + entityValues['f2b469f3a10721ab891e01b1d9817612'] = value # Note + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Mark to Dating Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() -print('finish') + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() + print('finished importing mark to dating relation') diff --git a/98__r__importMarkToLiteratureRelation.py b/98__r__importMarkToLiteratureRelation.py index 2e0d30b..05a26fc 100644 --- a/98__r__importMarkToLiteratureRelation.py +++ b/98__r__importMarkToLiteratureRelation.py @@ -5,83 +5,69 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importMarkToLiteratureRelation(api, engine): + print('importing mark to literature relation') + test = False -test = True + tableName = "r__mar__8330_lit_kurzt_" + bundleId = 'bd58cc7d59ce9f3e593e758a28dfcf4a' # Mark to literature relation -# Load the environment variables -load_dotenv() + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) - -tableName = "r__mar__8330_lit_kurzt_" -bundleId = 'bd58cc7d59ce9f3e593e758a28dfcf4a' # Mark to literature relation - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__mar__uuid': - entityValues['f4fccc9bad7fc559c153095bdcb32eeb'] = value # Mark UUID - fUuid = value[0] - case 'f__8330_lit_kurzt___uuid': - entityValues['f19ffb27810f7d14694afb54dd359451'] = value # Literature UUID - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__mar__uuid': + entityValues['f4fccc9bad7fc559c153095bdcb32eeb'] = value # Mark UUID + fUuid = value[0] + case 'f__8330_lit_kurzt___uuid': + entityValues['f19ffb27810f7d14694afb54dd359451'] = value # Literature UUID + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Mark to Literature Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() -print('finish') + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() + print('finished importing mark to literature relation') diff --git a/98__r__importMarkToMarkInformationRelation.py b/98__r__importMarkToMarkInformationRelation.py index dd69c36..23c9933 100644 --- a/98__r__importMarkToMarkInformationRelation.py +++ b/98__r__importMarkToMarkInformationRelation.py @@ -5,81 +5,68 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importMarkToMarkInformationRelation(api, engine): + print('importing mark to mark information relation') + test = False + tableName = "r__mar__6760_markenart" + bundleId = 'b241e8063b9259428967fa4ff134a8bd' # Mark to mark information relation -# Load the environment variables -load_dotenv() + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -test = False -tableName = "r__mar__6760_markenart" -bundleId = 'b241e8063b9259428967fa4ff134a8bd' # Mark to mark information relation - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__mar__uuid': - entityValues['fa64f8812c3c784b2d91454bc9a88279'] = value # Mark UUID - fUuid = value[0] - case 'f__6760_markenart__uuid': - entityValues['f9d5d6723ea78253330dd8e4b346cac6'] = value # Mark information assignment uuidNote - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__mar__uuid': + entityValues['fa64f8812c3c784b2d91454bc9a88279'] = value # Mark UUID + fUuid = value[0] + case 'f__6760_markenart__uuid': + entityValues['f9d5d6723ea78253330dd8e4b346cac6'] = value # Mark information assignment uuidNote + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Mark to Mark Information Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() -print('finish') + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() + print('finished importing mark to mark information relation') diff --git a/98__r__importMarkToSourceRelation.py b/98__r__importMarkToSourceRelation.py index 7c19d6f..fc534d8 100644 --- a/98__r__importMarkToSourceRelation.py +++ b/98__r__importMarkToSourceRelation.py @@ -5,82 +5,69 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importMarkToSourceRelation(api, engine): + print('importing mark to source relation') + test = False -# Load the environment variables -load_dotenv() + tableName = "r__mar__8130_que_kurzt_" + bundleId = 'b0edbf644e07765a5ae319802ec0289b' # Mark to source relation -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -test = False + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "r__mar__8130_que_kurzt_" -bundleId = 'b0edbf644e07765a5ae319802ec0289b' # Mark to source relation - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__mar__uuid': - entityValues['ffe35cef0c5d28bbebe195436706fc7c'] = value # Date - fUuid = value[0] - case 'f__8130_que_kurzt___uuid': - entityValues['f86e4b7f52add5640b824a601c66a2f6'] = value # Note - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__mar__uuid': + entityValues['ffe35cef0c5d28bbebe195436706fc7c'] = value # Date + fUuid = value[0] + case 'f__8130_que_kurzt___uuid': + entityValues['f86e4b7f52add5640b824a601c66a2f6'] = value # Note + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Mark to Source Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() -print('finish') + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() + print('finished importing mark to source relation') diff --git a/98__r__importSourceToDateRelation.py b/98__r__importSourceToDateRelation.py index c527dfa..2d1a21e 100644 --- a/98__r__importSourceToDateRelation.py +++ b/98__r__importSourceToDateRelation.py @@ -5,82 +5,69 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importSourceToDateRelation(api, engine): + print('importing source to date relation') + test = False -# Load the environment variables -load_dotenv() + tableName = "r__que__8100_datum" + bundleId = 'b4b8ba242075bf2c778894911c7f3264' # Source to date relation -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -test = False + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "r__que__8100_datum" -bundleId = 'b4b8ba242075bf2c778894911c7f3264' # Source to date relation - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__que__uuid': - entityValues['f2e8d1b76c8b196c8deb9e0abe90a5b3'] = value # Source UUID - fUuid = value[0] - case 'f__8100_datum__uuid': - entityValues['ff5ac62e6327599566d4474e18423265'] = value # Date UUID - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__que__uuid': + entityValues['f2e8d1b76c8b196c8deb9e0abe90a5b3'] = value # Source UUID + fUuid = value[0] + case 'f__8100_datum__uuid': + entityValues['ff5ac62e6327599566d4474e18423265'] = value # Date UUID + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Source to Date Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() -print('finish') + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() + print('finished importing source to date relation') diff --git a/98__r__importSourceToLiteratureReferenceAssignmentRelation.py b/98__r__importSourceToLiteratureReferenceAssignmentRelation.py index d69afd2..1ee0a93 100644 --- a/98__r__importSourceToLiteratureReferenceAssignmentRelation.py +++ b/98__r__importSourceToLiteratureReferenceAssignmentRelation.py @@ -5,82 +5,69 @@ import os # For environment variable loading from dotenv import load_dotenv # For environment variable loading import pandas as pd # For dataframe handling -# Initialize the database -print('Initializing the database...') -engine, metadata, Session = initDb(True, './schemas/') -if engine == False: - print('Database initialization failed.') - exit() +def importSourceToLiteratureReferenceAssignmentRelation(api, engine): + print('importing source to literature reference assignment relation') + test = False -# Load the environment variables -load_dotenv() + tableName = "r__que__8330_lit_kurzt_" + bundleId = 'bed2f320214a0344287c6c4db40e9331' # Source to literature reference assignemnt relation -# Initialize the WissKI API -print('Initializing the WissKI API...') -api_url = os.getenv('API_URL') -auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD')) -headers = {"Cache-Control": "no-cache"} -api = Api(api_url, auth, headers) -api.pathbuilder = api.get_pathbuilder('relations') + try: + processedRows = pd.read_csv(f'./logs/{tableName}.csv') + except FileNotFoundError: + processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri']) -test = False + # Load sources table + sqlTable = pd.read_sql_table(tableName, con=engine) -tableName = "r__que__8330_lit_kurzt_" -bundleId = 'bed2f320214a0344287c6c4db40e9331' # Source to literature reference assignemnt relation - -try: - processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') -except FileNotFoundError: - processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) - -# Load sources table -sqlTable = pd.read_sql_table(tableName, con=engine) - -entityValues = {} - -# Create entities -for index, row in sqlTable.iterrows(): - # For every row in table... - if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: - # skip if already processed - print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') - continue - # Create Entity property dicts entityValues = {} - for key, value in row.items(): - # For every column in row... - if (value is None) or (value == ''): - # skip if cell has no value + + # Create entities + for index, row in sqlTable.iterrows(): + # For every row in table... + if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']: + # skip if already processed + print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}') continue - # Properties of an entity have to be an array, so... - if '&' in str(value): - # ...Explode "&"-separated values to array items - value = [x.strip() for x in str(value).split('&')] - else: - # ...Or parse to array - value = [value] - # Map columns to fields. We use assignments for reification. - docId = '' - match key: - case 'id': - docId = value[0] - case 'f__que__uuid': - entityValues['faeb9c96c23eadd1a58df9ecd2154b68'] = value # Source UUID - fUuid = value[0] - case 'f__8330_lit_kurzt___uuid': - entityValues['fc15a069f1a7694c13107a348d3b7a39'] = value # Literature reference assignment UUID - case _: - print(f'{key} is not a valid field, skipping.') + # Create Entity property dicts + entityValues = {} + for key, value in row.items(): + # For every column in row... + if (value is None) or (value == ''): + # skip if cell has no value + continue + # Properties of an entity have to be an array, so... + value = str(value).replace('&###{{new_line}}###'.format(), '&') + value = str(value).replace('###{{new_line}}###', '&') + value = str(value).replace(' & ', '&') + if '&' in str(value): + # ...Explode "&"-separated values to array items + value = [x.strip() for x in str(value).split('&')] + else: + # ...Or parse to array + value = [value] + # Map columns to fields. We use assignments for reification. + docId = '' + match key: + case 'id': + docId = value[0] + case 'f__que__uuid': + entityValues['faeb9c96c23eadd1a58df9ecd2154b68'] = value # Source UUID + fUuid = value[0] + case 'f__8330_lit_kurzt___uuid': + entityValues['fc15a069f1a7694c13107a348d3b7a39'] = value # Literature reference assignment UUID + case _: + print(f'{key} is not a valid field, skipping.') - # Create Material - entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) - api.save(entity) + # Create Material + entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) + api.save(entity) - print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') + print(f'Created Source to Literature Reference Assignment Relation {index}: {entity.uri} of {len(sqlTable)}') - # Write log - processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) - processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) - if test: - exit() -print('finish') + # Write log + processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) + processedRows.to_csv(f'./logs/{tableName}.csv', index=False) + if test: + exit() + print('finished importing source to literature reference assignment relation') diff --git a/initDb.py b/initDb.py index c12d3a5..81de430 100644 --- a/initDb.py +++ b/initDb.py @@ -19,7 +19,7 @@ def initDb(_production, schemaDir): return (False, False) if _production: - dbName = 'ngk' + dbName = 'ngk_data_alt' else: dbName = 'testngk' diff --git a/initSchemas.py b/initSchemas.py index b30e116..0bcbf04 100644 --- a/initSchemas.py +++ b/initSchemas.py @@ -20,7 +20,7 @@ def createClass(name, columns): tableName = name.lower().replace('-', '_').replace('.', '_').replace(' ', '_').replace('(', '_').replace(')', '_').replace('ä', 'ae').replace('ö', 'oe').replace('ü', 'ue').replace('ß', 'ss').replace('?', '_') # Transform columns and add prefix - attrs = {'__tablename__': tableName} + attrs = {'__tablename__': tableName, '__table_args__': {'extend_existing': True}} attrs.update({prop.lower().replace('-', '_').replace('.', '_').replace(' ', '_').replace('(', '_').replace(')','_').replace('ä', 'ae').replace('ö', 'oe').replace('ü', 'ue').replace('ß', 'ss').replace('?', '_'): (Column(String(36), primary_key=True) if prop.lower() == 'uuid' else Column(Text)) for prop in columns}) # If 'uuid' is not in columns, add 'id' as primary key @@ -30,9 +30,6 @@ def createClass(name, columns): # Create SQLAlchemy class cls = type(className, (Base,), attrs) - # Define the table with extend_existing=True - Table(tableName, Base.metadata, extend_existing=True) - return cls def initClassesFromSchemas(schemaDir): diff --git a/requirements.txt b/requirements.txt index c3238a8..c3a58e5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ pandas pymysql sqlalchemy tqdm -wisski_py +