new commit

This commit is contained in:
rnsrk 2025-09-09 10:16:31 +02:00
parent da296f8a64
commit e46a9fd4ec
69 changed files with 4199 additions and 4805 deletions

2
.gitignore vendored
View file

@ -11,3 +11,5 @@ wisski_py
__pycache__ __pycache__
logs/* logs/*
.venv .venv
.env
.vscode

15
.vscode/launch.json vendored Normal file
View file

@ -0,0 +1,15 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
}
]
}

160
00_start.py Normal file
View file

@ -0,0 +1,160 @@
from importlib import import_module
from initDb import initDb # For database initialization
from wisski.api import Api, Pathbuilder, Entity # For WissKI API
import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading
from time import sleep
# Import entities
material_module = import_module("01_importMaterialsAndTechnique")
administrator_module = import_module("02_importAdministrator")
administrator_status_module = import_module("03_importAdministratorStatus")
source_module = import_module("03_importSource")
artist_source_reference_assignment_module = import_module("04_importArtistSourceReferenceAssignment")
marks_module = import_module("04_importMarks")
source_reference_assignment_module = import_module("04_importSourceReferenceAssignment")
artist_module = import_module("05_importArtist")
literature_module = import_module("06_importLiterature")
inspection_mark_module = import_module("07_importInspectionMark")
journal_assignment_module = import_module("07_importJournalAssignment")
literature_reference_assignment_module = import_module("07_importLiteratureReferenceAssignment")
parent_literature_assignment_module = import_module("07_importParentLiteratureAssignment")
inspection_mark_location_module = import_module("08_importInspectionMarkLocation")
inspection_mark_relation_module = import_module("09_importInspectionMarkRelation")
mark_dating_info_module = import_module("10_importMarkDatingInfo")
birth_module = import_module("12_importBirth")
death_module = import_module("13_importDeath")
dating_module = import_module("14_importDating")
goldsmith_relation_module = import_module("15_importGoldsmithRelation")
client_module = import_module("16_importClient")
mentioned_module = import_module("17_importMentioned")
num_dating_module = import_module("18_importNumDating")
origin_assignment_module = import_module("19_importOriginAssignment")
workshops_module = import_module("20_importWorkshops")
artifacts_module = import_module("21_importArtifacts")
artifact_relation_module = import_module("22_importArtifactRelation")
artist_assignment_module = import_module("24_importArtistAssignment")
mark_information_module = import_module("25_importMarkInformation")
photographer_module = import_module("26_importPhotographer")
# Import relations
artifact_to_artist_relation_module = import_module("98__r__importArtifactToArtistRelationRelation")
artifact_to_client_assignment_relation_module = import_module("98__r__importArtifactToClientAssignmentRelation")
artifact_to_inspection_mark_location_relation_module = import_module("98__r__importArtifactToInspectionMarkLocationRelation")
artifact_to_literature_reference_assignment_relation_module = import_module("98__r__importArtifactToLiteratureReferenceAssignmentRelation")
artifact_to_mark_information_assignment_relation_module = import_module("98__r__importArtifactToMarkInformationAssignmentRelation")
artifact_to_material_relation_module = import_module("98__r__importArtifactToMaterialRelation")
artifact_to_numerice_date_relation_module = import_module("98__r__importArtifactToNumericeDateRelation")
artifact_to_photograph_relation_module = import_module("98__r__importArtifactToPhotographRelation")
artifact_to_relation_relation_module = import_module("98__r__importArtifactToRelationRelation")
artifact_to_source_relation_module = import_module("98__r__importArtifactToSourceRelation")
artifact_to_status_administrator_relation_module = import_module("98__r__importArtifactToStatusAdministratorRelation")
artist_to_birth_relation_module = import_module("98__r__importArtistToBirthRelation")
artist_to_death_relation_module = import_module("98__r__importArtistToDeathRelation")
artist_to_goldsmith_relation_module = import_module("98__r__importArtistToGoldsmithRelation")
artist_to_literature_reference_relation_module = import_module("98__r__importArtistToLiteratureReferenceRelation")
artist_to_mentioned_relation_module = import_module("98__r__importArtistToMentionedRelation")
artist_to_origin_relation_module = import_module("98__r__importArtistToOriginRelation")
artist_to_workshop_relation_module = import_module("98__r__importArtistToWorkshopRelation")
inspection_mark_dating_information_assignment_relation_module = import_module("98__r__importInspectionMarkDatingInformationAssignmentRelation")
inspection_mark_relation_relation_module = import_module("98__r__importInspectionMarkRelationRelation")
inspection_mark_to_literature_reference_relation_module = import_module("98__r__importInspectionMarkToLiteratureReferenceRelation")
literature_to_journal_relation_module = import_module("98__r__importLiteratureToJournalRelation")
literature_to_parent_publication_relation_module = import_module("98__r__importLiteratureToParentPublicationRelation")
mark_to_dating_relation_module = import_module("98__r__importMarkToDatingRelation")
mark_to_literature_relation_module = import_module("98__r__importMarkToLiteratureRelation")
mark_to_mark_information_relation_module = import_module("98__r__importMarkToMarkInformationRelation")
mark_to_source_relation_module = import_module("98__r__importMarkToSourceRelation")
source_to_date_relation_module = import_module("98__r__importSourceToDateRelation")
source_to_literature_reference_assignment_relation_module = import_module("98__r__importSourceToLiteratureReferenceAssignmentRelation")
# Initialize the database
print('Initializing the database...')
engine, metadata, Session = initDb(True, './schemas/')
if engine is False or engine is None:
    # initDb signals failure through the engine slot of its return tuple.
    # NOTE(review): the original only tested for False; None is treated as a
    # failure as well - confirm against initDb.
    print('Database initialization failed.')
    # Exit with a non-zero status so callers (shell, cron, CI) see the failure;
    # the bare exit() used before reported success.
    raise SystemExit(1)
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
if not api_url:
    # Without a URL the client cannot reach the server; stop before any
    # import work is attempted.
    print('API_URL is not set; check the environment or the .env file.')
    raise SystemExit(1)
# Credentials are read from the environment; either may be None if unset.
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
# Entities are imported first, using the 'default' pathbuilder.
api.pathbuilders = ['default']
# Run the complete import (entities first, then relations), retrying the whole
# sequence when any step raises. Each import function receives the shared API
# client and database engine.
maxTrials = 3
retryDelaySeconds = 10
trials = 0
while trials < maxTrials:
    trials += 1
    # A previous trial may have failed after switching to the 'relations'
    # pathbuilder; entities must always be imported with 'default'.
    api.pathbuilders = ['default']
    try:
        # Call the function from the imported module
        material_module.importMaterialsAndTechnique(api, engine)
        administrator_module.importAdministrator(api, engine)
        administrator_status_module.importAdministratorStatus(api, engine)
        source_module.importSource(api, engine)
        artist_source_reference_assignment_module.importArtistSourceReferenceAssignment(api, engine)
        marks_module.importMarks(api, engine)
        source_reference_assignment_module.importSourceReferenceAssignment(api, engine)
        artist_module.importArtist(api, engine)
        literature_module.importLiterature(api, engine)
        inspection_mark_module.importInspectionMark(api, engine)
        journal_assignment_module.importJournalAssignment(api, engine)
        literature_reference_assignment_module.importLiteratureReferenceAssignment(api, engine)
        parent_literature_assignment_module.importParentLiteratureAssignment(api, engine)
        inspection_mark_location_module.importInspectionMarkLocation(api, engine)
        inspection_mark_relation_module.importInspectionMarkRelation(api, engine)
        mark_dating_info_module.importMarkDatingInfo(api, engine)
        birth_module.importBirth(api, engine)
        death_module.importDeath(api, engine)
        dating_module.importDating(api, engine)
        goldsmith_relation_module.importGoldsmithRelation(api, engine)
        client_module.importClient(api, engine)
        mentioned_module.importMentioned(api, engine)
        num_dating_module.importNumDating(api, engine)
        origin_assignment_module.importOriginAssignment(api, engine)
        workshops_module.importWorkshops(api, engine)
        artifacts_module.importArtifacts(api, engine)
        artifact_relation_module.importArtifactRelation(api, engine)
        artist_assignment_module.importArtistAssignment(api, engine)
        mark_information_module.importMarkInformation(api, engine)
        photographer_module.importPhotographer(api, engine)
        # Relations are created through their own pathbuilder.
        api.pathbuilders = ['relations']
        artifact_to_artist_relation_module.importArtifactToArtistRelationRelation(api, engine)
        artifact_to_client_assignment_relation_module.importArtifactToClientAssignmentRelation(api, engine)
        artifact_to_inspection_mark_location_relation_module.importArtifactToInspectionMarkLocationRelation(api, engine)
        artifact_to_literature_reference_assignment_relation_module.importArtifactToLiteratureReferenceAssignmentRelation(api, engine)
        artifact_to_mark_information_assignment_relation_module.importArtifactToMarkInformationAssignmentRelation(api, engine)
        artifact_to_material_relation_module.importArtifactToMaterialRelation(api, engine)
        artifact_to_numerice_date_relation_module.importArtifactToNumericeDateRelation(api, engine)
        artifact_to_photograph_relation_module.importArtifactToPhotographRelation(api, engine)
        artifact_to_relation_relation_module.importArtifactToRelationRelation(api, engine)
        artifact_to_source_relation_module.importArtifactToSourceRelation(api, engine)
        artifact_to_status_administrator_relation_module.importArtifactToStatusAdministratorRelation(api, engine)
        artist_to_birth_relation_module.importArtistToBirthRelation(api, engine)
        artist_to_death_relation_module.importArtistToDeathRelation(api, engine)
        artist_to_goldsmith_relation_module.importArtistToGoldsmithRelation(api, engine)
        artist_to_literature_reference_relation_module.importArtistToLiteratureReferenceRelation(api, engine)
        artist_to_mentioned_relation_module.importArtistToMentionedRelation(api, engine)
        artist_to_origin_relation_module.importArtistToOriginRelation(api, engine)
        artist_to_workshop_relation_module.importArtistToWorkshopRelation(api, engine)
        inspection_mark_dating_information_assignment_relation_module.importInspectionMarkDatingInformationAssignmentRelation(api, engine)
        inspection_mark_relation_relation_module.importInspectionMarkRelationRelation(api, engine)
        inspection_mark_to_literature_reference_relation_module.importInspectionMarkToLiteratureReferenceRelation(api, engine)
        literature_to_journal_relation_module.importLiteratureToJournalRelation(api, engine)
        literature_to_parent_publication_relation_module.importLiteratureToParentPublicationRelation(api, engine)
        mark_to_dating_relation_module.importMarkToDatingRelation(api, engine)
        mark_to_literature_relation_module.importMarkToLiteratureRelation(api, engine)
        mark_to_mark_information_relation_module.importMarkToMarkInformationRelation(api, engine)
        mark_to_source_relation_module.importMarkToSourceRelation(api, engine)
        source_to_date_relation_module.importSourceToDateRelation(api, engine)
        source_to_literature_reference_assignment_relation_module.importSourceToLiteratureReferenceAssignmentRelation(api, engine)
    except Exception as e:
        # Top-level boundary: report the failure and retry the whole sequence.
        print(f'Error: {e}')
        print(f'Trial {trials} of {maxTrials} failed.')
        if trials < maxTrials:
            # Only wait when another trial will actually follow.
            print(f'Retrying in {retryDelaySeconds} seconds...')
            sleep(retryDelaySeconds)
    else:
        # Everything was imported; without this break the full import would be
        # run again until the trial counter is exhausted.
        break
else:
    # The loop ended without a successful trial.
    print(f'Import aborted after {maxTrials} failed trials.')
    raise SystemExit(1)

View file

@ -1,81 +0,0 @@
import uuid # For UUID creation
from initDb import initDb # For database initialization
from wisski.api import Api, Pathbuilder, Entity # For WissKI API
import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling
# Initialize the database
print('Initializing the database...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilders = ['default']
try:
processedRows = pd.read_csv(f'./logs/processedMaterials.csv')
except FileNotFoundError:
processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri'])
# Load materials table
materialsTable = pd.read_sql_table('c__5280_material', con=engine)
# Create materials
for index, row in materialsTable.iterrows():
# For every row in table...
if index < len(processedRows) and materialsTable.loc[index, 'id'] == processedRows.iloc[index, 'id']:
# skip if already processed
print(f'Skipping already processed material {materialsTable.iloc[index, 0]}')
continue
# Create Entity property dicts
materialValues = {}
for key, value in row.items():
# For every column in row...
if (value is None) or (value == ''):
# skip if cell has no value
continue
# Properties of an entity have to be an array, so...
if '###{{new_line}}###' in str(value):
print('replaced curly braces')
value = str(value).replace('###{{new_line}}###', '')
if '&' in str(value):
# ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')]
else:
# ...Or parse to array
value = [value]
# Map columns to fields. We use assignments for reification.
match key:
case 'id':
continue
case 'f__uuid':
materialValues['fedfe553c2332bd4902c887813f29ed8'] = value # UUID
case 'f__5280_material':
materialValues['f5f4251312f54c0d104ea87761b94bde'] = value # Material
case 'f__5300_technik':
materialValues['f231e08850022f091ebd5055d8aad30f'] = value # Technique
case _:
print(f'{key} is not a valid field, skipping.')
# Create Material
material = Entity(api=api, fields=materialValues, bundle_id='b45978f2b073ff3c73b3c7220ebb3b89')
api.save(material)
print(f'Created material {index}: {material.uri}')
# Write log
processedRows = processedRows._append({'id': row['id'], 'uuid': materialValues['fedfe553c2332bd4902c887813f29ed8'][0], 'uri': material.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processedMaterials.csv', index=False)
print('finish')

View file

@ -0,0 +1,79 @@
import uuid # For UUID creation
from initDb import initDb # For database initialization
from wisski.api import Api, Pathbuilder, Entity # For WissKI API
import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling
def _isEmptyCell(value):
    """Return True when a table cell holds no usable value (None, NaN/NaT/NA or '')."""
    if value is None:
        return True
    if isinstance(value, str):
        return value == ''
    # pd.isna() returns a single truth value only for scalars, so guard on that.
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def _splitCellValue(value):
    """Split one cell into the list of non-empty, stripped property values."""
    text = str(value)
    # The new-line marker separates values just like '&'. Both brace spellings
    # are handled because the marker appears with single and double braces.
    for marker in ('###{{new_line}}###', '###{new_line}###'):
        text = text.replace(marker, '&')
    items = []
    for part in text.split('&'):
        # Every '&'-separated part may itself be a comma-separated list.
        for item in part.split(','):
            item = item.strip()
            if item:
                # Empty items (from '&&', a trailing '&' or ',,') are dropped.
                items.append(item)
    return items


def importMaterialsAndTechnique(api, engine):
    """Create one WissKI material entity per row of the materials table.

    Rows are read from the SQL table 'c__5280_material'. Every created entity
    is recorded in './logs/c__5280_material.csv' (columns id, uuid, uri); rows
    whose id is already in that log are skipped, so an interrupted import can
    be resumed by calling the function again.

    Args:
        api: WissKI API client used to build and save entities.
        engine: database connectable passed to pandas.read_sql_table.
    """
    print('Importing materials and technique...')
    tableName = 'c__5280_material'
    bundleId = 'b45978f2b073ff3c73b3c7220ebb3b89'
    uuidField = 'fedfe553c2332bd4902c887813f29ed8'
    # Table column -> entity field id.
    fieldByColumn = {
        'f__uuid': uuidField,  # UUID
        'f__5280_material': 'f5f4251312f54c0d104ea87761b94bde',  # Material
        'f__5300_technik': 'f231e08850022f091ebd5055d8aad30f',  # Technique
    }
    logPath = f'./logs/{tableName}.csv'
    # Make sure the log can be written before the first entity is created.
    os.makedirs('./logs', exist_ok=True)
    try:
        processedRows = pd.read_csv(logPath)
    except FileNotFoundError:
        processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri'])
    # Resume by id rather than by row position, so a changed row order cannot
    # create duplicates.
    # NOTE(review): assumes 'id' is unique per table row - confirm.
    processedIds = set(processedRows['id']) if 'id' in processedRows.columns else set()

    # Load materials table
    sqlTable = pd.read_sql_table(tableName, con=engine)

    # Create materials
    for index, row in sqlTable.iterrows():
        rowId = row['id']
        if rowId in processedIds:
            # skip if already processed
            print(f'Skipping already processed material {rowId}')
            continue

        # Create Entity property dicts
        materialValues = {}
        for key, value in row.items():
            if _isEmptyCell(value):
                # skip if cell has no value
                continue
            if key == 'id':
                # The primary key is only used for the log, not as a field.
                continue
            fieldId = fieldByColumn.get(key)
            if fieldId is None:
                print(f'{key} is not a valid field, skipping.')
                continue
            # Properties of an entity have to be an array.
            values = _splitCellValue(value)
            if values:
                materialValues[fieldId] = values

        # Create Material
        material = Entity(api=api, fields=materialValues, bundle_id=bundleId)
        api.save(material)
        print(f'Created material {index}: {material.uri} of {len(sqlTable)}')

        # Write log. A row without UUID must still be logged, otherwise the
        # entity just saved would be created again on the next run.
        uuids = materialValues.get(uuidField)
        logRow = {'id': rowId, 'uuid': uuids[0] if uuids else None, 'uri': material.uri}
        processedRows = pd.concat([processedRows, pd.DataFrame([logRow])], ignore_index=True)
        processedIds.add(rowId)
        # Rewritten after every row so progress survives a crash.
        processedRows.to_csv(logPath, index=False)

    print('finish')

View file

@ -5,41 +5,29 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importAdministrator(api, engine):
print('Initializing the database...') print('Importing administrators...')
engine, metadata, Session = initDb(True, './schemas/') tableName = 'c__vwr'
if engine == False: bundleId = 'b4e5a6a31ff575ab09b07b5f27d322ab'
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilders = ['default']
try: try:
processedRows = pd.read_csv(f'./logs/processedAdministrators.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['administratorId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri'])
# Load sources table # Load sources table
administratorsTable = pd.read_sql_table('c__vwr', con=engine) administratorsTable = pd.read_sql_table(tableName, con=engine)
administratorValues = {}
digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]}
# Create administrators # Create administrators
for index, row in administratorsTable.iterrows(): for index, row in administratorsTable.iterrows():
administratorValues = {}
digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]}
# For every row in table... # For every row in table...
if index < len(processedRows) and administratorsTable.iloc[index, 0] == processedRows.iloc[index, 0]: if index < len(processedRows) and administratorsTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed administrator {administratorsTable.iloc[index, 0]}') print(f'Skipping already processed administrator {administratorsTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
administratorValues = {} administratorValues = {}
@ -49,10 +37,9 @@ for index, row in administratorsTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
if '###{' in str(value): value = str(value).replace('&###{{new_line}}###'.format(), '&')
print('replaced curly braces') value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace('###{new_line', '') value = str(value).replace(' & ', '&')
value = str(value).replace('}###', '')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -96,14 +83,14 @@ for index, row in administratorsTable.iterrows():
# Set Digitisation Process # Set Digitisation Process
administratorValues['f3ec4640a87bd4534763af0fca050193'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process administratorValues['f3ec4640a87bd4534763af0fca050193'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process
# Create Material # Create Administrator
administrator = Entity(api=api, fields=administratorValues, bundle_id='b4e5a6a31ff575ab09b07b5f27d322ab') # Administrator administrator = Entity(api=api, fields=administratorValues, bundle_id=bundleId) # Administrator
api.save(administrator) api.save(administrator)
print(f'Created administrator {index}: {administrator.uri}') print(f'Created administrator {index}: {administrator.uri} of {len(administratorsTable)}')
# Write log # Write log
processedRows = processedRows._append({'administratorId': administratorValues['f37e82c36b4fc6b275a1a86a389481e1'][0], 'uuid': administratorValues['f707e595ce7301d61c064e8e44c9c4f4'][0], 'uri': administrator.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': administratorValues['f707e595ce7301d61c064e8e44c9c4f4'][0], 'uri': administrator.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processedAdministrators.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finished importing administrators')

View file

@ -5,40 +5,27 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importAdministratorStatus(api, engine):
print('Initializing the database...') print('Importing administrator statuses...')
engine, metadata, Session = initDb(True, './schemas/') tableName = 'c__ob28_status_verwalt_'
if engine == False: bundleId = 'b45447146729190da3a1d3e19165a6f8'
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
try: try:
processedRows = pd.read_csv(f'./logs/processedAdministratorStatus.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri'])
# Load sources table # Load sources table
administratorStatusTable = pd.read_sql_table('c__ob28_status_verwalt_', con=engine) administratorStatusTable = pd.read_sql_table(tableName, con=engine)
administratorStatusValues = {}
# Create administratorStatuss # Create administratorStatuss
for index, row in administratorStatusTable.iterrows(): for index, row in administratorStatusTable.iterrows():
administratorStatusValues = {}
# For every row in table... # For every row in table...
if index < len(processedRows) and administratorStatusTable.iloc[index, 0] == processedRows.iloc[index, 0]: if index < len(processedRows) and administratorStatusTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed administratorStatus {administratorStatusTable.iloc[index, 0]}') print(f'Skipping already processed administratorStatus {administratorStatusTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
administratorStatusValues = {} administratorStatusValues = {}
@ -48,6 +35,9 @@ for index, row in administratorStatusTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -83,6 +73,6 @@ for index, row in administratorStatusTable.iterrows():
# Write log # Write log
processedRows = processedRows._append({'id': row['id'], 'uuid': administratorStatusValues['f5ea2a7495ec872781ddc06f862b4270'][0], 'uri': administratorStatus.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': administratorStatusValues['f5ea2a7495ec872781ddc06f862b4270'][0], 'uri': administratorStatus.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processedAdministratorStatus.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finished importing administrator statuses')

View file

@ -5,41 +5,29 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importSource(api, engine):
print('Initializing the database...') print('Importing sources...')
engine, metadata, Session = initDb(True, './schemas/') tableName = 'c__que'
if engine == False: bundleId = 'b7dc57a93e008a58514b0d4ca26147b1'
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
try: try:
processedRows = pd.read_csv(f'./logs/processedSources.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['id','sourceId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=['id','sourceId', 'uuid', 'uri'])
# Load sources table # Load sources table
sourcesTable = pd.read_sql_table('c__que', con=engine) sourcesTable = pd.read_sql_table(tableName, con=engine)
sourceValues = {}
digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]}
# Create sources # Create sources
for index, row in sourcesTable.iterrows(): for index, row in sourcesTable.iterrows():
sourceValues = {}
digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]}
# For every row in table... # For every row in table...
if index < processedRows['id'].max(): if index < len(processedRows) and sourcesTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed source {row['id']}') print(f"Skipping already processed source {sourcesTable.loc[index, 'id']}")
continue continue
# Create Entity property dicts # Create Entity property dicts
sourceValues = {} sourceValues = {}
@ -49,9 +37,9 @@ for index, row in sourcesTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
if '###{{new_line}}###' in str(value): value = str(value).replace('&###{{new_line}}###'.format(), '&')
print('replaced curly braces') value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace('###{{new_line}}###', '') value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -116,13 +104,13 @@ for index, row in sourcesTable.iterrows():
sourceValues['ffdf27e75013fa55d31f728ff5166f06'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process sourceValues['ffdf27e75013fa55d31f728ff5166f06'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process
# Create Material # Create Material
source = Entity(api=api, fields=sourceValues, bundle_id='b7dc57a93e008a58514b0d4ca26147b1') source = Entity(api=api, fields=sourceValues, bundle_id=bundleId)
api.save(source) api.save(source)
print(f'Created source {index}: {source.uri}') print(f'Created source {index}: {source.uri} of {len(sourcesTable)}')
# Write log # Write log
processedRows = processedRows._append({'id': row['id'], 'sourceId': sourceValues['f50ad6021b42c094f7e551faec831802'][0], 'uuid': sourceValues['f9f02815a5631a85948d4d258a455f49'][0], 'uri': source.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'sourceId': sourceValues['f50ad6021b42c094f7e551faec831802'][0], 'uuid': sourceValues['f9f02815a5631a85948d4d258a455f49'][0], 'uri': source.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processedSources.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finish')

View file

@ -5,44 +5,28 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtistSourceReferenceAssignment(api, engine):
print('Initializing the database...') print('Importing artist source reference assignments...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
tableName = "c__81kr_que_kt_kue" tableName = "c__81kr_que_kt_kue"
bundleId = 'bf71940d0b18c20511e2141159afb9de' # Artist source reference assignment bundleId = 'bf71940d0b18c20511e2141159afb9de' # Artist source reference assignment
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=[ 'docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
entityValues = {}
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.iloc[index, 0] == processedRows.iloc[index, 0]: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.iloc[index, 0]}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -52,6 +36,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -81,7 +68,7 @@ for index, row in sqlTable.iterrows():
print(f'Created entity {index}: {entity.uri} of {len(tableName)}') print(f'Created entity {index}: {entity.uri} of {len(tableName)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finish')

View file

@ -5,41 +5,27 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importMarks(api, engine):
print('Initializing the database...') print('Importing marks...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables tableName = 'c__mar'
load_dotenv() bundleId = 'b2c4e1c984d7758d7c7ec719110f7125'
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
# Simple log
try: try:
processedRows = pd.read_csv(f'./logs/processedMarks.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['id', 'markId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=['id', 'markId', 'uuid', 'uri'])
# Load mark table # Load mark table
markTable = pd.read_sql_table('c__mar', con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
print(f'Processing {len(markTable)} marks...') print(f'Processing {len(sqlTable)} marks...')
# Create mark # Create mark
for index, row in markTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < processedRows['id'].max(): if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed mark {row['id']}') print(f'Skipping already processed mark {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
markValues = {} markValues = {}
@ -57,9 +43,9 @@ for index, row in markTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
if '###{{new_line}}###' in str(value): value = str(value).replace('&###{{new_line}}###'.format(), '&')
print('replaced curly braces') value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace('###{{new_line}}###', '') value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -172,6 +158,7 @@ for index, row in markTable.iterrows():
item = item.replace('Objekte\\', 'objects/') item = item.replace('Objekte\\', 'objects/')
item = item.replace('Objekte3\\', 'objects/') item = item.replace('Objekte3\\', 'objects/')
item = item.replace('Objekte4\\', 'objects/') item = item.replace('Objekte4\\', 'objects/')
item = item.replace('objekte4\\', 'objects/')
item = item.replace('Objekte5\\', 'objects/') item = item.replace('Objekte5\\', 'objects/')
item = item.replace('objekte5\\', 'objects/') item = item.replace('objekte5\\', 'objects/')
item = item.replace('Marken\\', 'marks/') item = item.replace('Marken\\', 'marks/')
@ -260,13 +247,13 @@ for index, row in markTable.iterrows():
markValues['f3baf98f752fc9638de175985183119a'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process markValues['f3baf98f752fc9638de175985183119a'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process
# Create Mark # Create Mark
mark = Entity(api=api, fields=markValues, bundle_id='b2c4e1c984d7758d7c7ec719110f7125') mark = Entity(api=api, fields=markValues, bundle_id=bundleId)
api.save(mark) api.save(mark)
print(f'Created mark number {index}: {mark.uri} of {len(markTable)}') print(f'Created mark number {index}: {mark.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'id': row['id'], 'markId': markValues['fe577970c02f173170ff3848a36b3b79'][0], 'uuid': markValues['fb40b199b4032e55acc152f994e93b45'][0], 'uri': mark.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'markId': markValues['fe577970c02f173170ff3848a36b3b79'][0], 'uuid': markValues['fb40b199b4032e55acc152f994e93b45'][0], 'uri': mark.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processedMarks.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finished importing marks')

View file

@ -5,44 +5,28 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importSourceReferenceAssignment(api, engine):
print('Initializing the database...') print('Importing source reference assignments...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
tableName = "c__8130_que_kurzt_" tableName = "c__8130_que_kurzt_"
bundleId = 'b3c4232e84c2f39795bd602f152ed6f0' # Source reference assignment bundleId = 'b3c4232e84c2f39795bd602f152ed6f0' # Source reference assignment
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=[ 'docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.iloc[index, 0] == processedRows.iloc[index, 0]: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.iloc[index, 0]}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -52,6 +36,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -78,9 +65,9 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(tableName)}') print(f'Created source reference assignment {index}: {entity.uri} of {len(tableName)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finished importing source reference assignments')

View file

@ -5,57 +5,44 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtist(api, engine):
print('Initializing the database...') print('Importing artists...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables tableName = 'c__kue'
load_dotenv() bundleId = 'bc322be33491dacc600dd43fdee09a5c'
# Initialize the WissKI API test = False
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
test = True
try: try:
processedRows = pd.read_csv(f'./logs/processedArtists.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['artistId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri'])
# Load sources table # Load sources table
artistsTable = pd.read_sql_table('c__kue', con=engine) artistsTable = pd.read_sql_table(tableName, con=engine)
# Create artists
for index, row in artistsTable.iterrows():
# For every row in table...
if index < len(processedRows) and artistsTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed
print(f'Skipping already processed artist {artistsTable.loc[index, "id"]}')
continue
# Create Entity property dicts
artistValues = {} artistValues = {}
digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]} digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]}
imageValues = {} imageValues = {}
reproNumberAssignmentValues = {'fac4426c096e7f8f44bb0e11b8394952': [str(uuid.uuid4())]} reproNumberAssignmentValues = {'fac4426c096e7f8f44bb0e11b8394952': [str(uuid.uuid4())]}
# Create artists
for index, row in artistsTable.iterrows():
# For every row in table...
if index < len(processedRows) and artistsTable.loc[index, 'f__3000_kue_dok_nr_'] == processedRows.loc[index, 'artistId']:
# skip if already processed
print(f'Skipping already processed artist {artistsTable.loc[index, "f__3000_kue_dok_nr_"]}')
continue
# Create Entity property dicts
artistValues = {}
for key, value in row.items(): for key, value in row.items():
# For every column in row... # For every column in row...
if (value is None) or (value == ''): if (value is None) or (value == ''):
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
if '###{{new_line}}###' in str(value): value = str(value).replace('&###{{new_line}}###'.format(), '&')
print('replaced curly braces') value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace('###{{new_line}}###', '') value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -135,7 +122,7 @@ for index, row in artistsTable.iterrows():
if value: if value:
imageItem = Entity(api=api, fields=value, bundle_id='b8c6c4b478ead1c80e175ad0f98dafe3') imageItem = Entity(api=api, fields=value, bundle_id='b8c6c4b478ead1c80e175ad0f98dafe3')
api.save(imageItem) api.save(imageItem)
imageList.append(value['f11beac4b638016479e6f3fbc7e55d1a'][0]) imageList.append(value['f11beac4b638016479e6f3fbc7e55d1a'][0]) # add UUID to list
# Create Image Assignment entities and add their UUIDs to a list # Create Image Assignment entities and add their UUIDs to a list
# because we link Artist and Image Assignment over the UUID # because we link Artist and Image Assignment over the UUID
@ -143,27 +130,29 @@ for index, row in artistsTable.iterrows():
reproNumberAssignmentValues['f2cd4ece6e60bf288b9ae769af08bc44'] = imageList # List of Image UUIDs reproNumberAssignmentValues['f2cd4ece6e60bf288b9ae769af08bc44'] = imageList # List of Image UUIDs
reproNumberAssignment = Entity(api=api, fields=reproNumberAssignmentValues, bundle_id='bdc233b242374a41b5e6923eee937fe9') reproNumberAssignment = Entity(api=api, fields=reproNumberAssignmentValues, bundle_id='bdc233b242374a41b5e6923eee937fe9')
api.save(reproNumberAssignment) api.save(reproNumberAssignment)
else:
reproNumberAssignmentValues['f2cd4ece6e60bf288b9ae769af08bc44'] = []
if reproNumberAssignmentValues['f2cd4ece6e60bf288b9ae769af08bc44'][0]: if reproNumberAssignmentValues['f2cd4ece6e60bf288b9ae769af08bc44']:
artistValues['f42deb039d8d4f47877892af005a1ef9'] = [reproNumberAssignmentValues['fac4426c096e7f8f44bb0e11b8394952'][0]] # Image Assignment artistValues['f42deb039d8d4f47877892af005a1ef9'] = [reproNumberAssignmentValues['fac4426c096e7f8f44bb0e11b8394952'][0]] # Image Assignment
if digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]: if digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]:
artistValues['f6c2b79f1ba142bb62f83b2c4d805e49'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process artistValues['f6c2b79f1ba142bb62f83b2c4d805e49'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process
# Create Material # Create Material
artist = Entity(api=api, fields=artistValues, bundle_id='bc322be33491dacc600dd43fdee09a5c') artist = Entity(api=api, fields=artistValues, bundle_id=bundleId)
api.save(artist) api.save(artist)
print(f'Created artist {index}: {artist.uri} of {len(artistsTable)}') print(f'Created artist {index}: {artist.uri} of {len(artistsTable)}')
# Write log # Write log
processedRows = processedRows._append({'artistId': artistValues['f61deac361ac5e0731edbf214761d15c'][0], 'uuid': artistValues['fff2eb2283e4cd8df3783602a1bc96ab'][0], 'uri': artist.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': artistValues['fff2eb2283e4cd8df3783602a1bc96ab'][0], 'uri': artist.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processedArtists.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
print('Testing mode activated. Exiting.') print('Testing mode activated. Exiting.')
exit() exit()
print('finish') print('finished importing artists')

View file

@ -5,53 +5,39 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importLiterature(api, engine):
print('Initializing the database...') print('Importing literature...')
engine, metadata, Session = initDb(True, './schemas/') tableName = 'c__lit'
if engine == False: bundleId = 'bafe9c3d3b640d4d1a16b104f367ac91'
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
try: try:
processedRows = pd.read_csv(f'./logs/processedLiteratures.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['id', 'literatureId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=['id', 'docId', 'uuid', 'uri'])
# Load sources table # Load sources table
literaturesTable = pd.read_sql_table('c__lit', con=engine) literaturesTable = pd.read_sql_table(tableName, con=engine)
literatureValues = {}
digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]}
# Create literatures # Create literatures
for index, row in literaturesTable.iterrows(): for index, row in literaturesTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and literaturesTable.iloc[index, 0] == processedRows.iloc[index, 0]: if index < len(processedRows) and literaturesTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed literature {literaturesTable.iloc[index, 0]}') print(f'Skipping already processed literature {literaturesTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
literatureValues = {} literatureValues = {}
digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]}
for key, value in row.items(): for key, value in row.items():
# For every column in row... # For every column in row...
if (value is None) or (value == ''): if (value is None) or (value == ''):
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
if '###{{new_line}}###' in str(value): value = str(value).replace('&###{{new_line}}###'.format(), '&')
print('replaced curly braces') value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace('###{{new_line}}###', '') value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -120,7 +106,7 @@ for index, row in literaturesTable.iterrows():
print(f'Created literature {index}: {literature.uri} of {len(literaturesTable)}') print(f'Created literature {index}: {literature.uri} of {len(literaturesTable)}')
# Write log # Write log
processedRows = processedRows._append({'id': row['id'], 'literatureId': literatureValues['f3bdd54b9ea5808a571200e9c60e103e'][0], 'uuid': literatureValues['fd58e0884f7cf63f8436c2789fcd2745'][0], 'uri': literature.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'docId': literatureValues['f3bdd54b9ea5808a571200e9c60e103e'][0], 'uuid': literatureValues['fd58e0884f7cf63f8436c2789fcd2745'][0], 'uri': literature.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processedLiteratures.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finish')

View file

@ -5,30 +5,15 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importInspectionMark(api, engine):
print('Initializing the database...') print('Importing inspection marks...')
engine, metadata, Session = initDb(True, './schemas/') tableName = 'c__bez'
if engine == False: bundleId = 'baad021dfda9b89d5ba407dd0fca0d03'
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
# Simple log
try: try:
processedRows = pd.read_csv(f'./logs/processedInspectionMarks.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['id', 'inspectionMarkId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=['id', 'docId', 'uuid', 'uri'])
# Load inspectionMark table # Load inspectionMark table
inspectionMarkTable = pd.read_sql_table('c__bez', con=engine) inspectionMarkTable = pd.read_sql_table('c__bez', con=engine)
@ -56,10 +41,9 @@ for index, row in inspectionMarkTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
if '###{' in str(value): value = str(value).replace('&###{{new_line}}###'.format(), '&')
print('replaced curly braces') value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace('###{new_line', '') value = str(value).replace(' & ', '&')
value = str(value).replace('}###', '')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -114,6 +98,7 @@ for index, row in inspectionMarkTable.iterrows():
if item is not None: if item is not None:
# Replace dir paths in name # Replace dir paths in name
item = item.replace('Objekte\\', 'objects/') item = item.replace('Objekte\\', 'objects/')
item = item.replace('Objekte/', 'objects/')
item = item.replace('Objekte3\\', 'objects/') item = item.replace('Objekte3\\', 'objects/')
item = item.replace('Objekte4\\', 'objects/') item = item.replace('Objekte4\\', 'objects/')
item = item.replace('Objekte5\\', 'objects/') item = item.replace('Objekte5\\', 'objects/')
@ -189,13 +174,13 @@ for index, row in inspectionMarkTable.iterrows():
inspectionMarkValues['f998036ccd7daaf2d9938934c93938f3'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process inspectionMarkValues['f998036ccd7daaf2d9938934c93938f3'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]] # Digitisation Process
# Create Mark # Create Mark
inspectionMark = Entity(api=api, fields=inspectionMarkValues, bundle_id='baad021dfda9b89d5ba407dd0fca0d03') inspectionMark = Entity(api=api, fields=inspectionMarkValues, bundle_id=bundleId)
api.save(inspectionMark) api.save(inspectionMark)
print(f'Created inspectionMark number {index}: {inspectionMark.uri} of {len(inspectionMarkTable)}') print(f'Created inspectionMark number {index}: {inspectionMark.uri} of {len(inspectionMarkTable)}')
# Write log # Write log
processedRows = processedRows._append({'id': row['id'], 'inspectionMarkId': inspectionMarkValues['fcdb19d95832ac030d353b5ba92796b7'][0], 'uuid': inspectionMarkValues['fb125fa322fe7c3c98446e382b1f22b9'][0], 'uri': inspectionMark.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'docId': inspectionMarkValues['fcdb19d95832ac030d353b5ba92796b7'][0], 'uuid': inspectionMarkValues['fb125fa322fe7c3c98446e382b1f22b9'][0], 'uri': inspectionMark.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processedInspectionMarks.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finished importing inspection marks')

View file

@ -5,44 +5,27 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importJournalAssignment(api, engine):
print('Initializing the database...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
test = False test = False
tableName = "c__8310_zeitschrift" tableName = "c__8310_zeitschrift"
bundleId = 'b5508ef3bb28f139ebdd9f6d545825c4' bundleId = 'b5508ef3bb28f139ebdd9f6d545825c4'
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -52,6 +35,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -77,11 +63,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created journal assignment {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()

View file

@ -5,44 +5,26 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importLiteratureReferenceAssignment(api, engine):
print('Initializing the database...') print('Importing literature reference assignments...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
tableName = "c__8330_lit_kurzt_" tableName = "c__8330_lit_kurzt_"
bundleId = 'bdda154adecb26deed2d8b67dab8a0db' # Literature Reference Assignment bundleId = 'bdda154adecb26deed2d8b67dab8a0db' # Literature Reference Assignment
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -52,6 +34,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -79,10 +64,10 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created literature reference assignment {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finish')

View file

@ -5,44 +5,26 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importParentLiteratureAssignment(api, engine):
print('Initializing the database...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
test = False test = False
tableName = "c__8292_uebergeordn_publ_" tableName = "c__8292_uebergeordn_publ_"
bundleId = 'bf55dda81ca0ddb4237a0d3ea495579b' # Parent literature assignment bundleId = 'bf55dda81ca0ddb4237a0d3ea495579b' # Parent literature assignment
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -52,6 +34,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -77,11 +62,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created parent literature assignment {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()

View file

@ -5,40 +5,26 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importInspectionMarkLocation(api, engine):
print('Initializing the database...') print('Importing inspection mark locations...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables tableName = 'c__67b0_bz_dok_nr'
load_dotenv() bundleId = 'b4158ec3a326d8ab504062296a82f13a'
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
try: try:
processedRows = pd.read_csv(f'./logs/processedInspectionMarkLocation.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri'])
# Load sources table # Load sources table
inspectionMarkLocationsTable = pd.read_sql_table('c__67b0_bz_dok_nr', con=engine) inspectionMarkLocationsTable = pd.read_sql_table(tableName, con=engine)
inspectionMarkLocationValues = {}
# Create inspectionMarkLocations # Create inspectionMarkLocations
for index, row in inspectionMarkLocationsTable.iterrows(): for index, row in inspectionMarkLocationsTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and inspectionMarkLocationsTable.iloc[index, 0] == processedRows.iloc[index, 0]: if index < len(processedRows) and inspectionMarkLocationsTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed inspectionMarkLocation {inspectionMarkLocationsTable.iloc[index, 0]}') print(f'Skipping already processed inspectionMarkLocation {inspectionMarkLocationsTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
inspectionMarkLocationValues = {} inspectionMarkLocationValues = {}
@ -48,10 +34,9 @@ for index, row in inspectionMarkLocationsTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
if '###{' in str(value): value = str(value).replace('&###{{new_line}}###'.format(), '&')
print('replaced curly braces') value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace('###{new_line', '') value = str(value).replace(' & ', '&')
value = str(value).replace('}###', '')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -73,13 +58,13 @@ for index, row in inspectionMarkLocationsTable.iterrows():
print(f'{key} is not a valid field, skipping.') print(f'{key} is not a valid field, skipping.')
# Create Material # Create Material
inspectionMarkLocation = Entity(api=api, fields=inspectionMarkLocationValues, bundle_id='b4158ec3a326d8ab504062296a82f13a') inspectionMarkLocation = Entity(api=api, fields=inspectionMarkLocationValues, bundle_id=bundleId)
api.save(inspectionMarkLocation) api.save(inspectionMarkLocation)
print(f'Created inspectionMarkLocation {index}: {inspectionMarkLocation.uri}') print(f'Created inspectionMarkLocation {index}: {inspectionMarkLocation.uri}')
# Write log # Write log
processedRows = processedRows._append({'id': row['id'], 'uuid': inspectionMarkLocationValues['f65178b07306225efb0b556f6e4f54a5'][0], 'uri': inspectionMarkLocation.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': inspectionMarkLocationValues['f65178b07306225efb0b556f6e4f54a5'][0], 'uri': inspectionMarkLocation.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processedInspectionMarkLocation.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finished importing inspection mark locations')

View file

@ -5,40 +5,26 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importInspectionMarkRelation(api, engine):
print('Initializing the database...') print('Importing inspection mark relations...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables tableName = 'c__67b7_beziehung'
load_dotenv() bundleId = 'bd9b0ff8dc3a6d9284e1798531389bf1'
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
try: try:
processedRows = pd.read_csv(f'./logs/processedInspectionMarkRelation.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=[ 'uuid', 'uri']) processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri'])
# Load sources table # Load sources table
inspectionMarkRelationsTable = pd.read_sql_table('c__67b7_beziehung', con=engine) inspectionMarkRelationsTable = pd.read_sql_table(tableName, con=engine)
inspectionMarkRelationValues = {}
# Create inspectionMarkRelations # Create inspectionMarkRelations
for index, row in inspectionMarkRelationsTable.iterrows(): for index, row in inspectionMarkRelationsTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and inspectionMarkRelationsTable.iloc[index, 0] == processedRows.iloc[index, 0]: if index < len(processedRows) and inspectionMarkRelationsTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed inspectionMarkRelation {inspectionMarkRelationsTable.iloc[index, 0]}') print(f'Skipping already processed inspectionMarkRelation {inspectionMarkRelationsTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
inspectionMarkRelationValues = {} inspectionMarkRelationValues = {}
@ -48,9 +34,9 @@ for index, row in inspectionMarkRelationsTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
if '###{{new_line}}###' in str(value): value = str(value).replace('&###{{new_line}}###'.format(), '&')
print('replaced curly braces') value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace('###{{new_line}}###', '') value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -75,10 +61,10 @@ for index, row in inspectionMarkRelationsTable.iterrows():
inspectionMarkRelation = Entity(api=api, fields=inspectionMarkRelationValues, bundle_id='bd9b0ff8dc3a6d9284e1798531389bf1') inspectionMarkRelation = Entity(api=api, fields=inspectionMarkRelationValues, bundle_id='bd9b0ff8dc3a6d9284e1798531389bf1')
api.save(inspectionMarkRelation) api.save(inspectionMarkRelation)
print(f'Created inspectionMarkRelation {index}: {inspectionMarkRelation.uri}') print(f'Created inspection mark relation {index}: {inspectionMarkRelation.uri}')
# Write log # Write log
processedRows = processedRows._append({'uuid': inspectionMarkRelationValues['ffd502413c286815811ae5546f73935b'][0], 'uri': inspectionMarkRelation.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': inspectionMarkRelationValues['ffd502413c286815811ae5546f73935b'][0], 'uri': inspectionMarkRelation.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processedInspectionMarkRelation.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finish')

View file

@ -5,40 +5,25 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importMarkDatingInfo(api, engine):
print('Initializing the database...') print('Importing mark dating info...')
engine, metadata, Session = initDb(True, './schemas/') tableName = 'c__68dm_datierung_marke'
if engine == False: bundleId = 'b9cfb95e627e1710cf8d736d4ca5db64'
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
try: try:
processedRows = pd.read_csv(f'./logs/processedDatingInfo.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=[ 'uuid', 'uri']) processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri'])
# Load sources table # Load sources table
datingInfosTable = pd.read_sql_table('c__68dm_datierung_marke', con=engine) datingInfosTable = pd.read_sql_table(tableName, con=engine)
datingInfoValues = {}
# Create datingInfos # Create datingInfos
for index, row in datingInfosTable.iterrows(): for index, row in datingInfosTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and datingInfosTable.iloc[index, 0] == processedRows.iloc[index, 0]: if index < len(processedRows) and datingInfosTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed datingInfo {datingInfosTable.iloc[index, 0]}') print(f'Skipping already processed datingInfo {datingInfosTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
datingInfoValues = {} datingInfoValues = {}
@ -48,9 +33,9 @@ for index, row in datingInfosTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
if '###{{new_line}}###' in str(value): value = str(value).replace('&###{{new_line}}###'.format(), '&')
print('replaced curly braces') value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace('###{{new_line}}###', '') value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -75,10 +60,10 @@ for index, row in datingInfosTable.iterrows():
datingInfo = Entity(api=api, fields=datingInfoValues, bundle_id='b9cfb95e627e1710cf8d736d4ca5db64') #Dating Information Assignment datingInfo = Entity(api=api, fields=datingInfoValues, bundle_id='b9cfb95e627e1710cf8d736d4ca5db64') #Dating Information Assignment
api.save(datingInfo) api.save(datingInfo)
print(f'Created datingInfo {index}: {datingInfo.uri} of {len(datingInfosTable)}') print(f'Created mark dating info {index}: {datingInfo.uri} of {len(datingInfosTable)}')
# Write log # Write log
processedRows = processedRows._append({'uuid': datingInfoValues['f74baaf58e49393cc89d6616ee197901'][0], 'uri': datingInfo.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': datingInfoValues['f74baaf58e49393cc89d6616ee197901'][0], 'uri': datingInfo.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processedDatingInfo.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finished importing mark dating info')

View file

@ -1,97 +0,0 @@
import uuid # For UUID creation
from initDb import initDb # For database initialization
from wisski.api import Api, Pathbuilder, Entity # For WissKI API
import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling
# Initialize the database
print('Initializing the database...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
tableName = "c__6760_markenart"
bundleId = 'bc7ce6906f78e760f22ff13226b1332d' # Mark information assignment
try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv')
except FileNotFoundError:
processedRows = pd.DataFrame(columns=['id', 'docId', 'uuid', 'uri'])
# Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine)
entityValues = {}
# Create entities
for index, row in sqlTable.iterrows():
# For every row in table...
if index < len(processedRows) and sqlTable.iloc[index, 0] == processedRows.iloc[index, 0]:
# skip if already processed
print(f'Skipping already processed entity {sqlTable.iloc[index, 0]}')
continue
# Create Entity property dicts
entityValues = {}
for key, value in row.items():
# For every column in row...
if (value is None) or (value == ''):
# skip if cell has no value
continue
# Properties of an entity have to be an array, so...
if '&' in str(value):
# ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')]
else:
# ...Or parse to array
value = [value]
# Map columns to fields. We use assignments for reification.
docId = ''
match key:
case 'id':
continue
case 'f__uuid':
entityValues['f3b8aaf7e79229b4da8214d491e375ec'] = value # UUID
fUuid = value[0]
case 'f__5064_num__dat_':
entityValues['fe6921098808e68cae68f0858411826c'] = value # Artist Assignment
case 'f__6894_anbr_ort':
entityValues['f694ed57271ab7be57249e0ee5c41ba4'] = value # Location
case 'f__6700_mar_dok_nr_':
entityValues['fdd3380d4a11654f32687429796cabc3'] = value # Mark Document Number
case 'f__6760_markenart':
entityValues['fd381aa9c3ebdf417e6cbccd60ede279'] = value # Mark Type
case 'f__684c_bedeutung_bz':
entityValues['f4947de52885f517baef0cdf3cb53b61'] = value # Meaning Inspection Mark
case 'f__684a_bedeutung_mz':
entityValues['f542c4c945725c6fdc5ab6409a877f02'] = value # Meaning Master Mark
case 'f__6770_rosenb_nr_':
entityValues['f0ff7020a9c25ea2706875837fe61b04'] = value # Rosenberg Number
case _:
print(f'{key} is not a valid field, skipping.')
# Create Material
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}')
# Write log
processedRows = processedRows._append({'id': row['id'], 'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False)
print('finish')

View file

@ -5,44 +5,27 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importBirth(api, engine):
print('Initializing the database...') print('Importing birth...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
test = False test = False
tableName = "c__3270_geb_datum" tableName = "c__3270_geb_datum"
bundleId = 'b54049ec931bffb62359b4bdb11435fc' bundleId = 'b54049ec931bffb62359b4bdb11435fc'
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -52,6 +35,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -87,12 +73,12 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created birth {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing birth')

View file

@ -5,45 +5,28 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importDeath(api, engine):
print('Initializing the database...') print('Importing death...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables test = False
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
test = True
tableName = "c__3330_todes_dat_" tableName = "c__3330_todes_dat_"
bundleId = 'b487c08016f572b9ecf3f9173339fec3' bundleId = 'b487c08016f572b9ecf3f9173339fec3'
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -53,6 +36,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -90,11 +76,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created death {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
break break

View file

@ -5,44 +5,26 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importDating(api, engine):
print('Initializing the database...') print('Importing dating...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
tableName = "c__8100_datum" tableName = "c__8100_datum"
bundleId = 'b9cfb95e627e1710cf8d736d4ca5db64' bundleId = 'b9cfb95e627e1710cf8d736d4ca5db64'
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=[ 'docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.iloc[index, 0] == processedRows.iloc[index, 0]: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.iloc[index, 0]}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -52,6 +34,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -65,7 +50,7 @@ for index, row in sqlTable.iterrows():
docId = value[0] docId = value[0]
case 'f__uuid': case 'f__uuid':
entityValues['f74baaf58e49393cc89d6616ee197901'] = value # UUID entityValues['f74baaf58e49393cc89d6616ee197901'] = value # UUID
uuid = value[0] fUuid = value[0]
case 'f__8100_datum': case 'f__8100_datum':
entityValues['f0da3b36d16e16602bb550aff7d36297'] = value # Date entityValues['f0da3b36d16e16602bb550aff7d36297'] = value # Date
case 'f__81bm_bem__datierung': case 'f__81bm_bem__datierung':
@ -77,10 +62,10 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created dating {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finish')

View file

@ -5,23 +5,8 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importGoldsmithRelation(api, engine):
print('Initializing the database...') print('Importing goldsmith relation...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
test = False test = False
@ -29,21 +14,19 @@ tableName = "c__3007_bezieh__zu_gs"
bundleId = 'bef43e8a958e6a9bee04534b3841f6a0' bundleId = 'bef43e8a958e6a9bee04534b3841f6a0'
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=[ 'docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.iloc[index, 0] == processedRows.iloc[index, 0]: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.iloc[index, 0]}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -53,6 +36,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -82,12 +68,12 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created goldsmith relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing goldsmith relation')

View file

@ -5,44 +5,27 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importClient(api, engine):
print('Initializing the database...') print('Importing client...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
tableName = "c__410a_auftraggeber" tableName = "c__410a_auftraggeber"
bundleId = 'b85d9987d762fb4e8ce89a69b0b8de31' bundleId = 'b85d9987d762fb4e8ce89a69b0b8de31'
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=[ 'docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.iloc[index, 0] == processedRows.iloc[index, 0]: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.iloc[index, 0]}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -52,6 +35,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -81,10 +67,10 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(tableName)}') print(f'Created client {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finish')

View file

@ -5,44 +5,27 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importMentioned(api, engine):
print('Initializing the database...') print('Importing mentioned...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
test = False test = False
tableName = "c__7060_erwaehnt__datum_" tableName = "c__7060_erwaehnt__datum_"
bundleId = 'b04b1756b09ba3260de278824332ad6c' bundleId = 'b04b1756b09ba3260de278824332ad6c'
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -52,6 +35,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -93,11 +79,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(tableName)}') print(f'Created mentioned {index}: {entity.uri} of {len(tableName)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()

View file

@ -5,44 +5,26 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importNumDating(api, engine):
print('Initializing the database...') print('Importing num dating...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
tableName = "c__5064_num__dat_" tableName = "c__5064_num__dat_"
bundleId = 'b9cfb95e627e1710cf8d736d4ca5db64' # Dating Information Assignment bundleId = 'b9cfb95e627e1710cf8d736d4ca5db64' # Dating Information Assignment
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=[ 'docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.iloc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -52,6 +34,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -65,7 +50,7 @@ for index, row in sqlTable.iterrows():
docId = value[0] docId = value[0]
case 'f__uuid': case 'f__uuid':
entityValues['f74baaf58e49393cc89d6616ee197901'] = value # UUID entityValues['f74baaf58e49393cc89d6616ee197901'] = value # UUID
uuid = value[0] fUuid = value[0]
case 'f__5064_num__dat_': case 'f__5064_num__dat_':
entityValues['f0da3b36d16e16602bb550aff7d36297'] = value # Date entityValues['f0da3b36d16e16602bb550aff7d36297'] = value # Date
case 'f__50bm_bem__datierung': case 'f__50bm_bem__datierung':
@ -77,10 +62,10 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(tableName)}') print(f'Created num dating {index}: {entity.uri} of {len(tableName)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finish')

View file

@ -5,44 +5,26 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importOriginAssignment(api, engine):
print('Initializing the database...') print('Importing origin assignment...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
test = False test = False
tableName = "c__3204_herkunft" tableName = "c__3204_herkunft"
bundleId = 'b1d5be81f8b3dfbf9d6d90379cc0a14f' bundleId = 'b1d5be81f8b3dfbf9d6d90379cc0a14f'
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=[ 'docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.iloc[index, 0] == processedRows.iloc[index, 0]: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.iloc[index, 0]}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -52,6 +34,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -77,13 +62,13 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created origin assignment {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing origin assignments')

View file

@ -5,41 +5,27 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importWorkshops(api, engine):
print('Initializing the database...') print('Importing workshops...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables tableName = "c__nfws_forts_werkst_"
load_dotenv() bundleId = 'beb03bccbdffdd31567df370303c1e2d'
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
try: try:
processedRows = pd.read_csv(f'./logs/processedWorkshops.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=[ 'uuid', 'uri']) processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri'])
test = False test = False
# Load sources table # Load sources table
workshopsTable = pd.read_sql_table('c__nfws_forts_werkst_', con=engine) workshopsTable = pd.read_sql_table(tableName, con=engine)
workshopValues = {}
# Create workshops # Create workshops
for index, row in workshopsTable.iterrows(): for index, row in workshopsTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and workshopsTable.iloc[index, 0] == processedRows.iloc[index, 0]: if index < len(processedRows) and workshopsTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed workshop {workshopsTable.iloc[index, 0]}') print(f'Skipping already processed entity {workshopsTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
workshopValues = {} workshopValues = {}
@ -49,6 +35,9 @@ for index, row in workshopsTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -71,16 +60,16 @@ for index, row in workshopsTable.iterrows():
print(f'{key} is not a valid field, skipping.') print(f'{key} is not a valid field, skipping.')
# Create Material # Create Material
workshop = Entity(api=api, fields=workshopValues, bundle_id='beb03bccbdffdd31567df370303c1e2d') workshop = Entity(api=api, fields=workshopValues, bundle_id=bundleId)
api.save(workshop) api.save(workshop)
print(f'Created workshop {index}: {workshop.uri} of {len(workshopsTable)}') print(f'Created workshop {index}: {workshop.uri} of {len(workshopsTable)}')
# Write log # Write log
processedRows = processedRows._append({'uuid': workshopValues['fa7c19f4d03d7d15acf588460654bbf2'][0], 'uri': workshop.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': workshopValues['fa7c19f4d03d7d15acf588460654bbf2'][0], 'uri': workshop.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processedWorkshops.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing workshops')

207
21_importArtifacts.py Normal file
View file

@ -0,0 +1,207 @@
import uuid # For UUID creation
from initDb import initDb # For database initialization
from wisski.api import Api, Pathbuilder, Entity # For WissKI API
import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling
# Source column -> pathbuilder field id for values stored directly on the Artifact.
_ARTIFACT_FIELDS = {
    'f__uuid': 'feb48c9a7efc444449b4b8defcd6d8bd',  # UUID
    'f__5000_obj_dok_nr_': 'f7e2a8a273ab3d577bf5854902550c09',  # Document Identifier
    'f__500n_ngk_nr_': 'f6e041bd0b16b21596849732c01cb168',  # NGK Number
    'f__5200_obj_titel': 'fd06dcc49a29b1a63fa4a789ec17e5c6',  # Title
    'f__5210_status': 'f35c9c9b0991729c36acb41645fe81d1',  # Status
    'f__5220_gattung': 'f2fd7f8a81d5eb1a20371b9acfd1ab59',  # Genre
    'f__5223_form__attribut': 'f05bbd6e29a7d303e4370b04c12b3f75',  # Formattribute
    'f__5226_art': 'f593fa773a6ea458101ba2325a18abbe',  # Artifact type
    'f__523f_funktion': 'f476ba24127d4dff1018acebf45a05f6',  # Function
    'f__5240_formtyp': 'fa7cfd9dbb3d2517c1898b3051d8dbed',  # Shape
    'f__524g_gestalt': 'f8309a21fa79bc6bd2506060b419d2df',  # Figure
    'f__55ng_darst__schlagw_': 'f6abbd4f39a6f79de5de2b14b98e51ff',  # Keywords
    'f__5bes_beschreibung': 'f26ad2bc1f084478cd7011f7b8451526',  # Description
    'f__5ges_geschichte': 'f40120d7c13ef02b486c69245f6c2306',  # History
    'f__68an_abdruck_nr_': 'fd3740649cc06f45677eb0546908cdac',  # Print Number
    'f__stwv_statwerkverz': 'fee0db94d62fae6370a89ff4757ff539',  # Catalogue_of_Works
    'f__9990_kommentar': 'fefe289aa0c9563a153be6da7d37e3ff',  # Comment
    'f__ptxt_plug_in_text': 'ffb8b04e8d57929a596fc32d6a84d07d',  # Plugin text
}

# Source column -> pathbuilder field id for values stored on the Digitisation Process.
_DIGITISATION_FIELDS = {
    'f__9900_datum_erfassung': 'f1f5dd22371e5c1de41e0fb099e0e862',  # Recording date
    'f__99ae_datum_aenderung': 'f8976c6a9e5d91fe9caba8a57c27f204',  # Change date
    'f__efbm_bem_erfassung': 'f78a6310d13c717b82ddba814ac59024',  # Recording note
}

# Source column -> dimension type label; every such column becomes its own Dimension entity.
_DIMENSION_TYPES = {
    'f__5362_hoehe': 'height',
    'f__5364_breite': 'width',
    'f__5366_tiefe': 'depth',
    'f__5368_laenge': 'length',
    'f__5370_durchmesser': 'diameter',
    'f__5380_gewicht': 'weight',
    'f__538h_hist__gewicht': 'historical_weight',
}

# Directory prefixes rewritten in reproduction numbers, applied in this order.
_IMAGE_PATH_REPLACEMENTS = (
    ('Objekte/', 'objects/'),
    ('Objekte\\', 'objects/'),
    ('Objekte3\\', 'objects/'),
    ('Objekte4\\', 'objects/'),
    ('objekte4\\', 'objects/'),
    ('Objekte5\\', 'objects/'),
    ('objekte5\\', 'objects/'),
    ('Marken\\', 'marks/'),
    ('Marken/', 'marks/'),
)


def _splitCellValue(cell):
    """Return the cell as a list of strings, exploding "&"-separated values."""
    # NOTE(review): `.format()` collapses `{{`/`}}` to single braces, so this
    # line matches `&###{new_line}###` while the next one matches the
    # double-brace spelling `###{{new_line}}###`. Confirm which spelling the
    # data actually contains; the expressions are kept exactly as they were.
    text = str(cell).replace('&###{{new_line}}###'.format(), '&')
    text = text.replace('###{{new_line}}###', '&')
    text = text.replace(' & ', '&')
    if '&' in text:
        return [part.strip() for part in text.split('&')]
    return [text]


def _normaliseImagePath(name):
    """Rewrite the legacy directory prefixes of a reproduction number."""
    for old, new in _IMAGE_PATH_REPLACEMENTS:
        name = name.replace(old, new)
    return name


def importArtifacts(api, engine):
    """Import the rows of table ``c__obj`` as Artifact entities.

    For every row that is not yet recorded in ``./logs/c__obj.csv`` this
    creates and saves, in this order: one Production Place Assignment, one
    Dimension per filled dimension column, one Image per reproduction
    number, one Image Assignment (only when the row has images), one
    Digitisation Process and finally the Artifact that references the
    others by their UUID values. After each artifact the log CSV is
    rewritten so an interrupted run can be resumed.

    Args:
        api: Object handed to ``Entity(api=...)`` and used via ``api.save``.
        engine: Database connection passed to ``pd.read_sql_table``.
    """
    print('Importing artifacts...')
    tableName = "c__obj"
    bundleId = 'bd30c2c64a3caa8bb1628c780c3f24bb'
    artifactUuidField = 'feb48c9a7efc444449b4b8defcd6d8bd'
    digitisationUuidField = 'f32274ec0032b8778ba69d20108590cc'
    imageAssignmentUuidField = 'f067784f5b1ff850672124a2b05360de'
    placeAssignmentUuidField = 'f40cc95db3ccaa1dbbf27294338d9f07'
    dimensionUuidField = 'f802fd7bf45be523a9b188411a591420'
    imageUuidField = 'f11beac4b638016479e6f3fbc7e55d1a'

    try:
        processedRows = pd.read_csv(f'./logs/{tableName}.csv')
    except FileNotFoundError:
        processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri'])

    # Load artifacts table
    artifactsTable = pd.read_sql_table(tableName, con=engine)

    # Create artifacts
    for index, row in artifactsTable.iterrows():
        # A row counts as processed when the log holds the same id at the same position.
        if index < len(processedRows) and artifactsTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
            print(f'Skipping already processed artifact {artifactsTable.loc[index, "id"]}')
            continue

        # Field dicts of the entities built for this row. The assignment-like
        # entities get a fresh UUID up front because the artifact links to them by it.
        artifactValues = {}
        digitisationProcessValues = {digitisationUuidField: [str(uuid.uuid4())]}
        imageValues = {}
        imageAssignmentValues = {imageAssignmentUuidField: [str(uuid.uuid4())]}
        productionPlaceAssignmentValues = {placeAssignmentUuidField: [str(uuid.uuid4())]}
        dimensionValues = {}

        for key, value in row.items():
            # Skip empty cells; pd.isna also catches NaN/NaT, which would
            # otherwise be imported as the strings 'nan'/'NaT'.
            if pd.isna(value) or value == '':
                continue
            if key == 'id':
                continue

            # Properties of an entity have to be an array.
            value = _splitCellValue(value)

            # Map columns to fields. We use assignments for reification.
            if key in _ARTIFACT_FIELDS:
                artifactValues[_ARTIFACT_FIELDS[key]] = value
            elif key in _DIGITISATION_FIELDS:
                digitisationProcessValues[_DIGITISATION_FIELDS[key]] = value
            elif key in _DIMENSION_TYPES:
                # We map dimensions to Dimension entities.
                dimensionValues[key] = {
                    'f31e9c7e2de5549daea1790a74615288': [_DIMENSION_TYPES[key]],  # Type
                    'f3f805d270890837a6493e7e60a96487': value,  # Dimension
                    dimensionUuidField: [str(uuid.uuid4())],  # UUID
                }
            elif key == 'f__5130_entst_ort':
                # We map the production place to the Production Place Assignment entity.
                productionPlaceAssignmentValues['f43f9589eef324fb12c26226dfe94246'] = value  # Production Place
            elif key == 'f__8540_repro_nr_':
                # We map images to Image entities, keyed by the normalised name.
                for item in value:
                    item = _normaliseImagePath(item)
                    imageValues[item] = {
                        'feb10344eaa7a5f414d1e8392853eba9': [item],  # Reproduction Number (Image)
                        'fc8d57e55f203c75c2f8a1ae79378ac7': ['public://artifact_images/' + item + '.jpg'],  # File
                        imageUuidField: [str(uuid.uuid4())],  # UUID
                    }
            else:
                print(f'{key} is not a valid field, skipping.')

        # Create Production Place Assignment. Entities are saved one at a
        # time rather than in a single batch.
        productionPlaceAssignment = Entity(api=api, fields=productionPlaceAssignmentValues, bundle_id='b13bc6dc04d4bbdafb9536987eb43244')
        api.save(productionPlaceAssignment)

        # Create Dimension entities and collect their UUIDs,
        # because we link Artifact and Dimension over the UUID.
        dimension = []
        for dimensionFields in dimensionValues.values():
            dimensionItem = Entity(api=api, fields=dimensionFields, bundle_id='b73258adf62f35bd1be3fa2863fab558')
            api.save(dimensionItem)
            dimension.append(dimensionFields[dimensionUuidField][0])

        # Create Image entities and collect their UUIDs,
        # because we link Image Assignment and Image over the UUID.
        imageList = []
        for imageFields in imageValues.values():
            imageItem = Entity(api=api, fields=imageFields, bundle_id='b8c6c4b478ead1c80e175ad0f98dafe3')
            api.save(imageItem)
            imageList.append(imageFields[imageUuidField][0])

        # Create the Image Assignment, which the Artifact links to over the UUID.
        # NOTE(review): the Image Assignment is only created when the row has
        # images, yet the artifact below always carries its UUID. Confirm
        # whether the entity should be created unconditionally.
        if imageList:
            imageAssignmentValues['f70afb79b45472fee3d02f011caa4b36'] = imageList  # List of Image UUIDs
            imageAssignment = Entity(api=api, fields=imageAssignmentValues, bundle_id='b88e5d94fb2a83d62df99cf64d6c010c')
            api.save(imageAssignment)

        # Create Digitisation Process
        digitisationProcess = Entity(api=api, fields=digitisationProcessValues, bundle_id='b22e6c47ccb3ab8a974b37279e1bc33b')
        api.save(digitisationProcess)

        # Add the field values for reference.
        # TODO(review): is the value used for the Production Place Assignment correct?
        artifactValues['f2676a0fb8db6ab62235328ae7c7a4b3'] = [productionPlaceAssignmentValues[placeAssignmentUuidField][0]]  # Production Place Assignment
        artifactValues['fc700eb3f24f4f2a6c165128aa7117f1'] = dimension  # Dimension
        artifactValues['f7af1cd9c77448281dd7ecf29ba57e3e'] = [imageAssignmentValues[imageAssignmentUuidField][0]]  # Image Assignment
        artifactValues['f5a3f90d920da3db4cfdbaa6264b0e89'] = [digitisationProcessValues[digitisationUuidField][0]]  # Digitisation Process

        # Create Artifact
        artifact = Entity(api=api, fields=artifactValues, bundle_id=bundleId)
        api.save(artifact)
        print(f'Created artifact {index}: {artifact.uri} of {len(artifactsTable)}')

        # Write log. A row without a UUID is still logged (with an empty
        # uuid) so that a re-run does not import the saved artifact twice.
        loggedUuid = artifactValues.get(artifactUuidField, [None])[0]
        processedRows = processedRows._append({'id': row['id'], 'uuid': loggedUuid, 'uri': artifact.uri}, ignore_index=True)
        processedRows.to_csv(f'./logs/{tableName}.csv', index=False)

    print('finished importing artifacts')

View file

@ -1,213 +0,0 @@
import uuid # For UUID creation
from initDb import initDb # For database initialization
from wisski.api import Api, Pathbuilder, Entity # For WissKI API
import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling
# Standalone import script: reads the 'c__obj' table and creates one WissKI
# Artifact entity per row, together with its helper entities (Production Place
# Assignment, Dimensions, Images, Image Assignment, Digitisation Process).
# Progress is logged to ./logs/processedArtifacts.csv after every row so that
# an interrupted run can be resumed.

# Initialize the database
print('Initializing the database...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
    # The script treats a False engine as a failed initialization.
    print('Database initialization failed.')
    # Exit with a non-zero status so callers/shells can detect the failure.
    raise SystemExit(1)

# Load the environment variables
load_dotenv()

# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')

# Field ids of the UUID / document identifier on the Artifact bundle.
artifactUuidField = 'feb48c9a7efc444449b4b8defcd6d8bd'
artifactDocIdField = 'f7e2a8a273ab3d577bf5854902550c09'

# Table column -> Artifact field id (plain one-to-one mappings).
artifactFields = {
    'f__uuid': artifactUuidField,  # UUID
    'f__5000_obj_dok_nr_': artifactDocIdField,  # Document Identifier
    'f__500n_ngk_nr_': 'f6e041bd0b16b21596849732c01cb168',  # NGK Number
    'f__5200_obj_titel': 'fd06dcc49a29b1a63fa4a789ec17e5c6',  # Title
    'f__5210_status': 'f35c9c9b0991729c36acb41645fe81d1',  # Status
    'f__5220_gattung': 'f2fd7f8a81d5eb1a20371b9acfd1ab59',  # Genre
    'f__5223_form__attribut': 'f05bbd6e29a7d303e4370b04c12b3f75',  # Formattribute
    'f__5226_art': 'f593fa773a6ea458101ba2325a18abbe',  # artifact type
    'f__523f_funktion': 'f476ba24127d4dff1018acebf45a05f6',  # Function
    'f__5240_formtyp': 'fa7cfd9dbb3d2517c1898b3051d8dbed',  # Shape
    'f__524g_gestalt': 'f8309a21fa79bc6bd2506060b419d2df',  # Figure
    'f__55ng_darst__schlagw_': 'f6abbd4f39a6f79de5de2b14b98e51ff',  # Keywords
    'f__5bes_beschreibung': 'f26ad2bc1f084478cd7011f7b8451526',  # Description
    'f__5ges_geschichte': 'f40120d7c13ef02b486c69245f6c2306',  # History
    'f__68an_abdruck_nr_': 'fd3740649cc06f45677eb0546908cdac',  # Print Number
    'f__stwv_statwerkverz': 'fee0db94d62fae6370a89ff4757ff539',  # Catalogue_of_Works
    'f__9990_kommentar': 'fefe289aa0c9563a153be6da7d37e3ff',  # Comment
    'f__ptxt_plug_in_text': 'ffb8b04e8d57929a596fc32d6a84d07d',  # Plugin text
}

# Table column -> Digitisation Process field id.
digitisationFields = {
    'f__9900_datum_erfassung': 'f1f5dd22371e5c1de41e0fb099e0e862',  # Recording date
    'f__99ae_datum_aenderung': 'f8976c6a9e5d91fe9caba8a57c27f204',  # Change date
    'f__efbm_bem_erfassung': 'f78a6310d13c717b82ddba814ac59024',  # Recording note
}

# Table column -> value written to the "Type" field of a Dimension entity.
dimensionTypes = {
    'f__5362_hoehe': 'height',
    'f__5364_breite': 'width',
    'f__5366_tiefe': 'depth',
    'f__5368_laenge': 'length',
    'f__5370_durchmesser': 'diameter',
    'f__5380_gewicht': 'weight',
    'f__538h_hist__gewicht': 'historical_weight',
}

# Directory prefixes in reproduction numbers and their replacements.
# Applied in this order, exactly like the original chain of replace() calls.
imagePathReplacements = (
    ('Objekte/', 'objects/'),
    ('Objekte\\', 'objects/'),
    ('Objekte3\\', 'objects/'),
    ('Objekte4\\', 'objects/'),
    ('Objekte5\\', 'objects/'),
    ('objekte5\\', 'objects/'),
    ('Marken\\', 'marks/'),
    ('Marken/', 'marks/'),
)

try:
    processedRows = pd.read_csv('./logs/processedArtifacts.csv')
except FileNotFoundError:
    processedRows = pd.DataFrame(columns=['artifactId', 'uuid', 'uri'])

# Load artifacts table
artifactsTable = pd.read_sql_table('c__obj', con=engine)

# Create artifacts
for index, row in artifactsTable.iterrows():
    # For every row in table...
    # NOTE(review): this compares the table's first column with the log's
    # first column ('artifactId', which stores the document identifier).
    # Unless the table's first column holds that same identifier the check
    # never matches and nothing is skipped on resume -- confirm.
    if index < len(processedRows) and artifactsTable.iloc[index, 0] == processedRows.iloc[index, 0]:
        # skip if already processed
        print(f'Skipping already processed artifact {artifactsTable.iloc[index, 0]}')
        continue

    # Create Entity property dicts. The helper entities get a fresh UUID up
    # front because the Artifact links to them via that UUID.
    artifactValues = {}
    digitisationProcessValues = {'f32274ec0032b8778ba69d20108590cc': [str(uuid.uuid4())]}
    imageValues = {}
    imageAssignmentValues = {'f067784f5b1ff850672124a2b05360de': [str(uuid.uuid4())]}
    productionPlaceAssignmentValues = {'f40cc95db3ccaa1dbbf27294338d9f07': [str(uuid.uuid4())]}
    dimensionValues = {}

    for key, value in row.items():
        # For every column in row...
        if value is None or value == '' or pd.isna(value):
            # skip if cell has no value (None, empty string, NaN or NaT)
            continue
        # Properties of an entity have to be an array, so...
        if '&' in str(value):
            # ...Explode "&"-separated values to array items
            value = [x.strip() for x in str(value).split('&')]
        else:
            # ...Or parse to array
            value = [value]

        # Map columns to fields. We use assignments for reification.
        if key == 'id':
            continue
        if key in artifactFields:
            artifactValues[artifactFields[key]] = value
        elif key in digitisationFields:
            digitisationProcessValues[digitisationFields[key]] = value
        elif key == 'f__5130_entst_ort':
            # We map productions place to Production Place Assignment entity.
            productionPlaceAssignmentValues['f43f9589eef324fb12c26226dfe94246'] = value  # Production Place
        elif key in dimensionTypes:
            # We map dimensions to Dimension entity.
            dimensionValues[key] = {
                'f31e9c7e2de5549daea1790a74615288': [dimensionTypes[key]],  # Type
                'f3f805d270890837a6493e7e60a96487': value,  # Dimension
                'f802fd7bf45be523a9b188411a591420': [str(uuid.uuid4())],  # UUID
            }
        elif key == 'f__8540_repro_nr_':
            # We map images to Image entity
            for item in value:
                # Replace dir paths in name
                item = str(item)
                for oldPrefix, newPrefix in imagePathReplacements:
                    item = item.replace(oldPrefix, newPrefix)
                imageValues[item] = {
                    'feb10344eaa7a5f414d1e8392853eba9': [item],  # Reproduction Number (Image)
                    'fc8d57e55f203c75c2f8a1ae79378ac7': ['public://artifact_images/' + item + '.jpg'],  # File
                    'f11beac4b638016479e6f3fbc7e55d1a': [str(uuid.uuid4())],  # UUID
                }
        else:
            print(f'{key} is not a valid field, skipping.')

    # Create Production Place Assignment
    productionPlaceAssignment = Entity(api=api, fields=productionPlaceAssignmentValues, bundle_id='b13bc6dc04d4bbdafb9536987eb43244')
    api.save(productionPlaceAssignment)  # Kai says, we can save all entities at once, but I save it instantly

    # Create Dimension entities and add their UUIDs to a list
    # because we link Artifact and Dimension over the UUID
    dimension = []
    for dimensionFields in dimensionValues.values():
        dimensionItem = Entity(api=api, fields=dimensionFields, bundle_id='b73258adf62f35bd1be3fa2863fab558')
        api.save(dimensionItem)
        dimension.append(dimensionFields['f802fd7bf45be523a9b188411a591420'][0])

    # Create Image entities and add their UUIDs to a list
    # because we link Image Assignment and Image over the UUID
    imageList = []
    for imageFields in imageValues.values():
        imageItem = Entity(api=api, fields=imageFields, bundle_id='b8c6c4b478ead1c80e175ad0f98dafe3')
        api.save(imageItem)
        imageList.append(imageFields['f11beac4b638016479e6f3fbc7e55d1a'][0])

    # Create Image Assignment entities and add their UUIDs to a list
    # because we link Artifact and Image Assignment over the UUID
    if imageList:
        imageAssignmentValues['f70afb79b45472fee3d02f011caa4b36'] = imageList  # List of Image UUIDs
        imageAssignment = Entity(api=api, fields=imageAssignmentValues, bundle_id='b88e5d94fb2a83d62df99cf64d6c010c')
        api.save(imageAssignment)

    # Create Digitisation Process
    digitisationProcess = Entity(api=api, fields=digitisationProcessValues, bundle_id='b22e6c47ccb3ab8a974b37279e1bc33b')
    api.save(digitisationProcess)

    # Add the field values for reference
    # NOTE(review): original author's open question -- is the value used for
    # the Production Place Assignment reference correct?
    artifactValues['f2676a0fb8db6ab62235328ae7c7a4b3'] = [productionPlaceAssignmentValues['f40cc95db3ccaa1dbbf27294338d9f07'][0]]  # Production Place Assignment
    artifactValues['fc700eb3f24f4f2a6c165128aa7117f1'] = dimension  # Dimension
    # NOTE(review): the Image Assignment UUID is referenced even when no Image
    # Assignment entity was saved above (row without images) -- confirm intended.
    artifactValues['f7af1cd9c77448281dd7ecf29ba57e3e'] = [imageAssignmentValues['f067784f5b1ff850672124a2b05360de'][0]]  # Image Assignment
    artifactValues['f5a3f90d920da3db4cfdbaa6264b0e89'] = [digitisationProcessValues['f32274ec0032b8778ba69d20108590cc'][0]]  # Digitisation Process

    # Create Artifact
    artifact = Entity(api=api, fields=artifactValues, bundle_id='bd30c2c64a3caa8bb1628c780c3f24bb')
    api.save(artifact)
    print(f'Created artifact {index}: {artifact.uri} of {len(artifactsTable)}')

    # Write log. Use .get() so that a row lacking a document number or UUID
    # cannot raise after the entity has already been saved (which would lose
    # the log entry and cause a duplicate on the next run).
    logEntry = {
        'artifactId': artifactValues.get(artifactDocIdField, [None])[0],
        'uuid': artifactValues.get(artifactUuidField, [None])[0],
        'uri': artifact.uri,
    }
    processedRows = pd.concat([processedRows, pd.DataFrame([logEntry])], ignore_index=True)
    # The whole log is rewritten after every row so progress survives a crash.
    processedRows.to_csv('./logs/processedArtifacts.csv', index=False)

print('finish')

View file

@ -5,44 +5,26 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtifactRelation(api, engine):
print('Initializing the database...') print('Importing artifact relation...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
tableName = "c__5007_beziehung" tableName = "c__5007_beziehung"
bundleId = 'bf4a13ee46de57819f88834caaddc301' # Artifact relation assignment bundleId = 'bf4a13ee46de57819f88834caaddc301' # Artifact relation assignment
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.ioc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed artifact relation {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -52,6 +34,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -65,7 +50,7 @@ for index, row in sqlTable.iterrows():
docId = value[0] docId = value[0]
case 'f__uuid': case 'f__uuid':
entityValues['ff7ebd530eb53efc489e80d9bbef293e'] = value # UUID entityValues['ff7ebd530eb53efc489e80d9bbef293e'] = value # UUID
uuid = value[0] fUuid = value[0]
case 'f__5008_bez_obj_nr_': case 'f__5008_bez_obj_nr_':
entityValues['f39d0e5207a375070d84b958017a62e8'] = value # Artifact Document Identifier entityValues['f39d0e5207a375070d84b958017a62e8'] = value # Artifact Document Identifier
case 'f__bebm_bem_beziehung': case 'f__bebm_bem_beziehung':
@ -79,10 +64,10 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created artifact relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finished importing artifact relation')

View file

@ -5,50 +5,38 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtistAssignment(api, engine):
print('Initializing the database...') print('Importing artist assignment...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables tableName = "c__ob30_bez_kuenstler"
load_dotenv() bundleId = 'bc8826cc7d9c9373ce71cfc0251c2a4f'
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')
try: try:
processedRows = pd.read_csv(f'./logs/processedArtistAssignment.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri'])
# Load sources table # Load sources table
artistRelationsTable = pd.read_sql_table('c__ob30_bez_kuenstler', con=engine) artistRelationsTable = pd.read_sql_table(tableName, con=engine)
artistRelationValues = {}
# Create artistRelations # Create artistRelations
for index, row in artistRelationsTable.iterrows(): for index, row in artistRelationsTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and artistRelationsTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and artistRelationsTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed artistRelation {artistRelationsTable.loc[index, 'id']}') print(f'Skipping already processed artistAssignment {artistRelationsTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
artistRelationValues = {}
for key, value in row.items(): for key, value in row.items():
print('value: ', value)
# For every column in row... # For every column in row...
if (value is None) or (value == ''): if (value is None) or (value == ''):
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -71,13 +59,13 @@ for index, row in artistRelationsTable.iterrows():
print(f'{key} is not a valid field, skipping.') print(f'{key} is not a valid field, skipping.')
artistRelation = Entity(api=api, fields=artistRelationValues, bundle_id='bc8826cc7d9c9373ce71cfc0251c2a4f') artistRelation = Entity(api=api, fields=artistRelationValues, bundle_id=bundleId)
api.save(artistRelation) api.save(artistRelation)
print(f'Created artistRelation {index}: {artistRelation.uri} of {len(artistRelationsTable)}') print(f'Created artist assignment {index}: {artistRelation.uri} of {len(artistRelationsTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': artistRelationValues['fc150259d31fea8a3f992e7beb901fa4'][0], 'uri': artistRelation.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': artistRelationValues['fc150259d31fea8a3f992e7beb901fa4'][0], 'uri': artistRelation.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processedArtistAssignment.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finished importing artist assignment')

View file

@ -0,0 +1,82 @@
import uuid # For UUID creation
from initDb import initDb # For database initialization
from wisski.api import Api, Pathbuilder, Entity # For WissKI API
import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling
def _explodeMarkCellValue(value):
    """Return a table cell as a list of strings, split on '&'.

    Properties of an entity have to be arrays, so every cell is converted to
    a list; new-line markers and ' & ' are normalised to '&' first.
    """
    # NOTE(review): because of .format() the first pattern is searched with
    # single braces ('&###{new_line}###'), while the second pattern is searched
    # with literal double braces -- confirm which form the export contains.
    text = str(value).replace('&###{{new_line}}###'.format(), '&')
    text = text.replace('###{{new_line}}###', '&')
    text = text.replace(' & ', '&')
    if '&' in text:
        # Explode "&"-separated values to array items
        return [part.strip() for part in text.split('&')]
    return [text]


def importMarkInformation(api, engine):
    """Create one mark information entity per row of 'c__6760_markenart'.

    Rows already listed (at the same position, with the same id) in
    ./logs/c__6760_markenart.csv are skipped; the log is rewritten after
    every saved entity so an interrupted run can be resumed.

    Args:
        api: WissKI API client; used to build entities and for ``api.save``.
        engine: database connection passed to ``pd.read_sql_table``.
    """
    print('Importing mark information...')
    tableName = "c__6760_markenart"
    bundleId = 'bc7ce6906f78e760f22ff13226b1332d'  # Mark information assignment
    uuidColumn = 'f__uuid'

    # Table column -> field id. We use assignments for reification.
    fieldIds = {
        uuidColumn: 'f3b8aaf7e79229b4da8214d491e375ec',  # UUID
        'f__5064_num__dat_': 'fe6921098808e68cae68f0858411826c',  # Artist Assignment
        'f__6894_anbr_ort': 'f694ed57271ab7be57249e0ee5c41ba4',  # Location
        'f__6700_mar_dok_nr_': 'fdd3380d4a11654f32687429796cabc3',  # Mark Document Number
        'f__6760_markenart': 'fd381aa9c3ebdf417e6cbccd60ede279',  # Mark Type
        'f__684c_bedeutung_bz': 'f4947de52885f517baef0cdf3cb53b61',  # Meaning Inspection Mark
        'f__684a_bedeutung_mz': 'f542c4c945725c6fdc5ab6409a877f02',  # Meaning Master Mark
        'f__6770_rosenb_nr_': 'f0ff7020a9c25ea2706875837fe61b04',  # Rosenberg Number
    }

    try:
        processedRows = pd.read_csv(f'./logs/{tableName}.csv')
    except FileNotFoundError:
        processedRows = pd.DataFrame(columns=['id', 'docId', 'uuid', 'uri'])

    # Load sources table
    sqlTable = pd.read_sql_table(tableName, con=engine)

    # Create entities
    for index, row in sqlTable.iterrows():
        # For every row in table...
        if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
            # skip if already processed
            print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
            continue

        # Per-row state. fUuid is reset here so that a row without a UUID can
        # neither reuse the previous row's UUID nor raise a NameError after
        # the entity has already been saved.
        entityValues = {}
        fUuid = None

        for key, value in row.items():
            # For every column in row...
            if value is None or value == '' or pd.isna(value):
                # skip if cell has no value (None, empty string, NaN or NaT)
                continue
            if key == 'id':
                continue
            if key not in fieldIds:
                print(f'{key} is not a valid field, skipping.')
                continue
            value = _explodeMarkCellValue(value)
            entityValues[fieldIds[key]] = value
            if key == uuidColumn:
                fUuid = value[0]

        # Create the entity
        entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
        api.save(entity)
        print(f'Created mark information {index}: {entity.uri} of {len(sqlTable)}')

        # Write log. 'docId' is kept (always empty, as before) so the CSV
        # layout stays compatible with existing log files.
        logEntry = {'id': row['id'], 'docId': '', 'uuid': fUuid, 'uri': entity.uri}
        processedRows = pd.concat([processedRows, pd.DataFrame([logEntry])], ignore_index=True)
        # The whole log is rewritten after every row so progress survives a crash.
        processedRows.to_csv(f'./logs/{tableName}.csv', index=False)

    print('finish')

View file

@ -1,90 +0,0 @@
import uuid # For UUID creation
from initDb import initDb # For database initialization
from wisski.api import Api, Pathbuilder, Entity # For WissKI API
import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling
# Standalone import script: reads the 'c__8490_fotograf' table and creates one
# WissKI photographer assignment entity per row. Progress is logged to
# ./logs/processed-c__8490_fotograf.csv after every row so that an interrupted
# run can be resumed.

# Initialize the database
print('Initializing the database...')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
    # The script treats a False engine as a failed initialization.
    print('Database initialization failed.')
    # Exit with a non-zero status so callers/shells can detect the failure.
    raise SystemExit(1)

# Load the environment variables
load_dotenv()

# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('default')

tableName = "c__8490_fotograf"
bundleId = 'b821fb6c518948b7f40d17803b6ce293'  # Photographer assignment
uuidColumn = 'f__uuid'

# Table column -> field id. We use assignments for reification.
fieldIds = {
    uuidColumn: 'f6c3c3e35af2f2073fd517aabf88fa7c',  # UUID
    'f__8490_fotograf': 'fe8f8b235f896862b74caa0fa8f3682d',  # Photographer
    'f__8494_aufn_datum': 'f12c7538643314f0f46ba76a5140a87d',  # Recording Date
    'f__8470_aufnahmenr_': 'ff6ec986fb4cc5a2f34deb7144f2f817',  # Recording number
    'f__849r_repro_datei': 'f24a609593559a904a0a0f2e215db584',  # Reproduction Number (Image Assignment)
}

try:
    processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv')
except FileNotFoundError:
    processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri'])

# Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine)

# Create entities
for index, row in sqlTable.iterrows():
    # For every row in table...
    if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']:
        # skip if already processed
        # (double quotes inside the f-string: reusing the outer quote
        # character is a SyntaxError before Python 3.12)
        print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
        continue

    # Per-row state. docUuid is reset here so that a row without a UUID can
    # neither reuse the previous row's UUID nor raise a NameError after the
    # entity has already been saved.
    entityValues = {}
    docUuid = None

    for key, value in row.items():
        # For every column in row...
        if value is None or value == '' or pd.isna(value):
            # skip if cell has no value (None, empty string, NaN or NaT)
            continue
        # Properties of an entity have to be an array, so...
        if '&' in str(value):
            # ...Explode "&"-separated values to array items
            value = [x.strip() for x in str(value).split('&')]
        else:
            # ...Or parse to array
            value = [value]

        if key == 'id':
            # The row id is only needed for the log, taken from `row` below.
            continue
        if key not in fieldIds:
            print(f'{key} is not a valid field, skipping.')
            continue
        entityValues[fieldIds[key]] = value
        if key == uuidColumn:
            docUuid = value[0]

    # Create the entity
    entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
    api.save(entity)
    print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}')

    # Write log. The row id is stored in 'docId' because the resume check
    # above compares the table's 'id' with this column; previously the value
    # was reset to '' for every column, so the check could never match.
    logEntry = {'docId': row['id'], 'uuid': docUuid, 'uri': entity.uri}
    processedRows = pd.concat([processedRows, pd.DataFrame([logEntry])], ignore_index=True)
    # The whole log is rewritten after every row so progress survives a crash.
    processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False)

print('finish')

75
26_importPhotographer.py Normal file
View file

@ -0,0 +1,75 @@
import uuid # For UUID creation
from initDb import initDb # For database initialization
from wisski.api import Api, Pathbuilder, Entity # For WissKI API
import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling
def _explodePhotographerCellValue(value):
    """Return a table cell as a list of strings, split on '&'.

    Properties of an entity have to be arrays, so every cell is converted to
    a list; new-line markers and ' & ' are normalised to '&' first.
    """
    # NOTE(review): because of .format() the first pattern is searched with
    # single braces ('&###{new_line}###'), while the second pattern is searched
    # with literal double braces -- confirm which form the export contains.
    text = str(value).replace('&###{{new_line}}###'.format(), '&')
    text = text.replace('###{{new_line}}###', '&')
    text = text.replace(' & ', '&')
    if '&' in text:
        # Explode "&"-separated values to array items
        return [part.strip() for part in text.split('&')]
    return [text]


def importPhotographer(api, engine):
    """Create one photographer assignment entity per row of 'c__8490_fotograf'.

    Rows already listed (at the same position, with the same id) in
    ./logs/c__8490_fotograf.csv are skipped; the log is rewritten after every
    saved entity so an interrupted run can be resumed.

    Args:
        api: WissKI API client; used to build entities and for ``api.save``.
        engine: database connection passed to ``pd.read_sql_table``.
    """
    print('Importing photographer...')
    tableName = "c__8490_fotograf"
    bundleId = 'b821fb6c518948b7f40d17803b6ce293'  # Photographer assignment
    uuidColumn = 'f__uuid'

    # Table column -> field id. We use assignments for reification.
    fieldIds = {
        uuidColumn: 'f6c3c3e35af2f2073fd517aabf88fa7c',  # UUID
        'f__8490_fotograf': 'fe8f8b235f896862b74caa0fa8f3682d',  # Photographer
        'f__8494_aufn_datum': 'f12c7538643314f0f46ba76a5140a87d',  # Recording Date
        'f__8470_aufnahmenr_': 'ff6ec986fb4cc5a2f34deb7144f2f817',  # Recording number
        'f__849r_repro_datei': 'f24a609593559a904a0a0f2e215db584',  # Reproduction Number (Image Assignment)
    }

    try:
        processedRows = pd.read_csv(f'./logs/{tableName}.csv')
    except FileNotFoundError:
        processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri'])

    # Load sources table
    sqlTable = pd.read_sql_table(tableName, con=engine)

    # Create entities
    for index, row in sqlTable.iterrows():
        # For every row in table...
        if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
            # skip if already processed
            print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
            continue

        # Per-row state. docUuid is reset here so that a row without a UUID
        # can neither reuse the previous row's UUID nor raise a NameError
        # after the entity has already been saved.
        entityValues = {}
        docUuid = None

        for key, value in row.items():
            # For every column in row...
            if value is None or value == '' or pd.isna(value):
                # skip if cell has no value (None, empty string, NaN or NaT)
                continue
            if key == 'id':
                # The row id is only needed for the log, taken from `row` below.
                continue
            if key not in fieldIds:
                print(f'{key} is not a valid field, skipping.')
                continue
            value = _explodePhotographerCellValue(value)
            entityValues[fieldIds[key]] = value
            if key == uuidColumn:
                docUuid = value[0]

        # Create the entity
        entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
        api.save(entity)
        print(f'Created Photographer {index}: {entity.uri} of {len(sqlTable)}')

        # Write log
        logEntry = {'id': row['id'], 'uuid': docUuid, 'uri': entity.uri}
        processedRows = pd.concat([processedRows, pd.DataFrame([logEntry])], ignore_index=True)
        # The whole log is rewritten after every row so progress survives a crash.
        processedRows.to_csv(f'./logs/{tableName}.csv', index=False)

    print('finished importing photographer')

View file

@ -5,31 +5,14 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtifactToArtistRelationRelation(api, engine):
print('Initializing the database...') print('importing artifact to artist relation relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
tableName = "r__obj__ob30_bez_kuenstler" tableName = "r__obj__ob30_bez_kuenstler"
bundleId = 'b8b4e3b3fb7e3b83cec037aea51814bf' # Artifact to artist relation relation bundleId = 'b8b4e3b3fb7e3b83cec037aea51814bf' # Artifact to artist relation relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -39,9 +22,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
for key, value in row.items(): for key, value in row.items():
@ -50,6 +33,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -73,10 +59,10 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created artifact to artist relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finished importing artifact to artist relation relation')

View file

@ -5,32 +5,16 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtifactToClientAssignmentRelation(api, engine):
print('Initializing the database...') print('importing artifact to client assignment relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
tableName = "r__obj__410a_auftraggeber" tableName = "r__obj__410a_auftraggeber"
bundleId = 'b20d53dcc2bad79457251a581611b43f' # Artifact to client assignment relation bundleId = 'b20d53dcc2bad79457251a581611b43f' # Artifact to client assignment relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -40,9 +24,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -52,6 +36,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -75,10 +62,10 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Artifact to Client Assignment Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finished importing artifact to client assignment relation')

View file

@ -5,32 +5,15 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtifactToInspectionMarkLocationRelation(api, engine):
print('Initializing the database...') print('importing artifact to inspection mark location relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
tableName = "r__obj__67b0_bz_dok_nr" tableName = "r__obj__67b0_bz_dok_nr"
bundleId = 'b7fe64e0326c107a1a4a705be08392fa' # Artifact to inspection mark location relation bundleId = 'b7fe64e0326c107a1a4a705be08392fa' # Artifact to inspection mark location relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -40,9 +23,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
for key, value in row.items(): for key, value in row.items():
@ -51,6 +34,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -74,10 +60,10 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Artifact to Inspection Mark Location Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finished importing artifact to inspection mark location relation')

View file

@ -5,31 +5,14 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtifactToLiteratureReferenceAssignmentRelation(api, engine):
print('Initializing the database...') print('importing artifact to literature reference assignment relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
tableName = "r__obj__8330_lit_kurzt_" tableName = "r__obj__8330_lit_kurzt_"
bundleId = 'b6a7b7aad942ecff4b3beadf907d51c8' # Artifact to literature relation bundleId = 'b6a7b7aad942ecff4b3beadf907d51c8' # Artifact to literature relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -39,9 +22,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -51,6 +34,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -74,10 +60,10 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Artifact to Literature Reference Assignment Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finished importing artifact to literature reference assignment relation')

View file

@ -5,31 +5,15 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtifactToMarkInformationAssignmentRelation(api, engine):
print('Initializing the database...') print('importing artifact to mark information assignment relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
tableName = "r__obj__6760_markenart" tableName = "r__obj__6760_markenart"
bundleId = 'b7112c2a7ea92a1d263d42d5572a05fc' # Artifact to mark information assignment relation bundleId = 'b7112c2a7ea92a1d263d42d5572a05fc' # Artifact to mark information assignment relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -39,9 +23,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -51,6 +35,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -74,10 +61,10 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Artifact to Mark Information Assignment Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finished importing artifact to mark information assignment relation')

View file

@ -5,31 +5,15 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtifactToMaterialRelation(api, engine):
print('Initializing the database...') print('importing artifact to material relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
tableName = "r__obj__5280_material" tableName = "r__obj__5280_material"
bundleId = 'b825aff7df3d48bd875e2a081c796305' # Artifact to material relation bundleId = 'b825aff7df3d48bd875e2a081c796305' # Artifact to material relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -39,9 +23,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -51,6 +35,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -74,10 +61,10 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Artifact to Material Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finished importing artifact to material relation')

View file

@ -5,32 +5,16 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtifactToNumericeDateRelation(api, engine):
print('Initializing the database...') print('importing artifact to numeric date relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
tableName = "r__obj__5064_num__dat_" tableName = "r__obj__5064_num__dat_"
bundleId = 'b795fcfa6c684fa707c236c4b0882ad7' # Artifact to numeric date relation bundleId = 'b795fcfa6c684fa707c236c4b0882ad7' # Artifact to numeric date relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -40,9 +24,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -52,6 +36,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -75,10 +62,10 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Artifact to Numeric Date Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finished importing artifact to numeric date relation')

View file

@ -5,31 +5,15 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtifactToPhotographRelation(api, engine):
print('Initializing the database...') print('importing artifact to photograph relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
tableName = "r__obj__8490_fotograf" tableName = "r__obj__8490_fotograf"
bundleId = 'b63cd713e60b6e5bc3b2235dffc0dba9' # Artifact to photograph relation bundleId = 'b63cd713e60b6e5bc3b2235dffc0dba9' # Artifact to photograph relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -39,9 +23,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -51,6 +35,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -74,10 +61,10 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Artifact to Photograph Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finished importing artifact to photograph relation')

View file

@ -5,32 +5,16 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtifactToRelationRelation(api, engine):
print('Initializing the database...') print('importing artifact to relation relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
tableName = "r__obj__5007_beziehung" tableName = "r__obj__5007_beziehung"
bundleId = 'bb878dd9c44c83a70fbd151f1dc06b4d' # Artifact to relation relation bundleId = 'bb878dd9c44c83a70fbd151f1dc06b4d' # Artifact to relation relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -40,9 +24,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -52,6 +36,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -75,10 +62,10 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Artifact to Relation Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finished importing artifact to relation relation')

View file

@ -5,31 +5,15 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtifactToSourceRelation(api, engine):
print('Initializing the database...') print('importing artifact to source relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
tableName = "r__obj__8130_que_kurzt_" tableName = "r__obj__8130_que_kurzt_"
bundleId = 'bcf720dc0b796043915d6da536414451' # Artifact to source relation bundleId = 'bcf720dc0b796043915d6da536414451' # Artifact to source relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -39,9 +23,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -51,6 +35,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -74,9 +61,9 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Artifact to Source Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
print('finish') print('finished importing artifact to source relation')

View file

@ -5,31 +5,15 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtifactToStatusAdministratorRelation(api, engine):
print('Initializing the database...') print('importing artifact to status administrator relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
test = False test = False
tableName = "r__obj__ob28_status_verwalt_" tableName = "r__obj__ob28_status_verwalt_"
bundleId = 'bd4922f100ab534fc1213f767770ed6d' # Artifact to status administrator relation bundleId = 'bd4922f100ab534fc1213f767770ed6d' # Artifact to status administrator relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -39,9 +23,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -51,6 +35,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -74,13 +61,13 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Artifact to Status Administrator Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing artifact to status administrator relation')

View file

@ -5,33 +5,17 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtistToBirthRelation(api, engine):
print('Initializing the database...') print('importing artist to birth relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
test = False test = False
tableName = "r__kue__3270_geb_datum" tableName = "r__kue__3270_geb_datum"
bundleId = 'b82e4404cdf641db57c03d7e3b23947c' # Artist to birth relation bundleId = 'b82e4404cdf641db57c03d7e3b23947c' # Artist to birth relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -41,9 +25,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -53,6 +37,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -76,12 +63,12 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Artist to Birth Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing artist to birth relation')

View file

@ -5,32 +5,16 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtistToDeathRelation(api, engine):
print('Initializing the database...') print('importing artist to death relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
test = False test = False
tableName = "r__kue__3330_todes_dat_" tableName = "r__kue__3330_todes_dat_"
bundleId = 'b91ed11c8063a363063582f001a3f5a2' # Artist to death relation bundleId = 'b91ed11c8063a363063582f001a3f5a2' # Artist to death relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -40,9 +24,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -52,6 +36,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -75,11 +62,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Artist to Death Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing artist to death relation')

View file

@ -5,33 +5,17 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtistToGoldsmithRelation(api, engine):
print('Initializing the database...') print('importing artist to goldsmith relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
test = False test = False
tableName = "r__kue__3007_bezieh__zu_gs" tableName = "r__kue__3007_bezieh__zu_gs"
bundleId = 'b464b2b43aaa27aaba71e337c9af649c' # Artist to goldsmith relation bundleId = 'b464b2b43aaa27aaba71e337c9af649c' # Artist to goldsmith relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -41,9 +25,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -53,6 +37,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -76,11 +63,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Artist to Goldsmith Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing artist to goldsmith relation')

View file

@ -5,33 +5,17 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtistToLiteratureReferenceRelation(api, engine):
print('Initializing the database...') print('importing artist to literature reference relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
test = False test = False
tableName = "r__kue__8330_lit_kurzt_" tableName = "r__kue__8330_lit_kurzt_"
bundleId = 'b7a87e3f3d5f671c1f163101bff30eb6' # Artist to literature relation bundleId = 'b7a87e3f3d5f671c1f163101bff30eb6' # Artist to literature relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -41,9 +25,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -53,6 +37,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -76,11 +63,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Artist to Literature Reference Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing artist to literature reference relation')

View file

@ -5,33 +5,17 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtistToMentionedRelation(api, engine):
print('Initializing the database...') print('importing artist to mentioned relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
test = False test = False
tableName = "r__kue__7060_erwaehnt__datum_" tableName = "r__kue__7060_erwaehnt__datum_"
bundleId = 'bc2b0ddca583320a56a67b304dc0a045' # Artist to mentioned relation bundleId = 'bc2b0ddca583320a56a67b304dc0a045' # Artist to mentioned relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -41,9 +25,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -53,6 +37,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -76,11 +63,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Artist to Mentioned Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing artist to mentioned relation')

View file

@ -5,32 +5,16 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtistToOriginRelation(api, engine):
print('Initializing the database...') print('importing artist to origin relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
test = False test = False
tableName = "r__kue__3204_herkunft" tableName = "r__kue__3204_herkunft"
bundleId = 'b5cf6b3e6fd2e4b5575da4347999d6ea' # Artist to origin relation bundleId = 'b5cf6b3e6fd2e4b5575da4347999d6ea' # Artist to origin relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -40,9 +24,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -52,6 +36,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -75,11 +62,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Artist to Origin Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing artist to origin relation')

View file

@ -5,33 +5,17 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importArtistToWorkshopRelation(api, engine):
print('Initializing the database...') print('importing artist to workshop relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
test = False test = False
tableName = "r__kue__nfws_forts_werkst_" tableName = "r__kue__nfws_forts_werkst_"
bundleId = 'becb95326a733bdbd0c2dd3d36e3399d' # Artist to workshop relation bundleId = 'becb95326a733bdbd0c2dd3d36e3399d' # Artist to workshop relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -41,9 +25,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -53,6 +37,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -76,11 +63,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Artist to Workshop Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing artist to workshop relation')

View file

@ -5,33 +5,17 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importInspectionMarkDatingInformationAssignmentRelation(api, engine):
print('Initializing the database...') print('importing inspection mark dating information assignment relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
test = False test = False
tableName = "r__bez__68dm_datierung_marke" tableName = "r__bez__68dm_datierung_marke"
bundleId = 'b1fee832598b2d42ed17a927dad43b90' # Inspection Mark to dating information assignment relation bundleId = 'b1fee832598b2d42ed17a927dad43b90' # Inspection Mark to dating information assignment relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -41,9 +25,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -76,11 +60,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Inspection Mark to Dating Information Assignment Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing inspection mark dating information assignment relation')

View file

@ -5,33 +5,17 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importInspectionMarkRelationRelation(api, engine):
print('Initializing the database...') print('importing inspection mark relation relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
test = False test = False
tableName = "r__bez__67b7_beziehung" tableName = "r__bez__67b7_beziehung"
bundleId = 'bc8dcd233a9b539db407bad219715988' # Inspection Mark Relation Relation bundleId = 'bc8dcd233a9b539db407bad219715988' # Inspection Mark Relation Relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -41,9 +25,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -53,6 +37,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -77,11 +64,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Inspection Mark to Relation Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing inspection mark relation relation')

View file

@ -5,31 +5,15 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importInspectionMarkToLiteratureReferenceRelation(api, engine):
print('Initializing the database...') print('importing inspection mark to literature reference relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
test = False test = False
tableName = "r__bez__8330_lit_kurzt_" tableName = "r__bez__8330_lit_kurzt_"
bundleId = 'b32fc778865a1ffd5b165515425f38c6' # Inspection Mark to Dating Assignment bundleId = 'b32fc778865a1ffd5b165515425f38c6' # Inspection Mark to Dating Assignment
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uri']) processedRows = pd.DataFrame(columns=['docId', 'uri'])
@ -41,9 +25,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -53,6 +37,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -76,11 +63,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Inspection Mark to Literature Reference Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing inspection mark to literature reference relation')

View file

@ -5,33 +5,17 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importLiteratureToJournalRelation(api, engine):
print('Initializing the database...') print('importing literature to journal relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
test = False test = False
tableName = "r__lit__8310_zeitschrift" tableName = "r__lit__8310_zeitschrift"
bundleId = 'b6c2ce0add1e7999f48d66b7ef1a4a26' # Literature to journal relation bundleId = 'b6c2ce0add1e7999f48d66b7ef1a4a26' # Literature to journal relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -41,9 +25,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -53,6 +37,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -76,11 +63,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Literature to Journal Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing literature to journal relation')

View file

@ -5,33 +5,17 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importLiteratureToParentPublicationRelation(api, engine):
print('Initializing the database...') print('importing literature to parent publication relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
test = False test = False
tableName = "r__lit__8292_uebergeordn_publ_" tableName = "r__lit__8292_uebergeordn_publ_"
bundleId = 'b2adaaa15714d83ea83cd3333af437df' # Literature to parent publication relation bundleId = 'b2adaaa15714d83ea83cd3333af437df' # Literature to parent publication relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -41,9 +25,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -53,6 +37,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -76,11 +63,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Literature to Parent Publication Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing literature to parent publication relation')

View file

@ -5,33 +5,17 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importMarkToDatingRelation(api, engine):
print('Initializing the database...') print('importing mark to dating relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
test = False test = False
tableName = "r__mar__68dm_datierung_marke" tableName = "r__mar__68dm_datierung_marke"
bundleId = 'b105b749b25de3aa55329b82fe18c18d' # Mark to dating relation bundleId = 'b105b749b25de3aa55329b82fe18c18d' # Mark to dating relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -41,9 +25,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -76,11 +60,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Mark to Dating Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing mark to dating relation')

View file

@ -5,34 +5,17 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importMarkToLiteratureRelation(api, engine):
print('Initializing the database...') print('importing mark to literature relation')
engine, metadata, Session = initDb(True, './schemas/') test = False
if engine == False:
print('Database initialization failed.')
exit()
test = True
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
tableName = "r__mar__8330_lit_kurzt_" tableName = "r__mar__8330_lit_kurzt_"
bundleId = 'bd58cc7d59ce9f3e593e758a28dfcf4a' # Mark to literature relation bundleId = 'bd58cc7d59ce9f3e593e758a28dfcf4a' # Mark to literature relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -42,9 +25,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -54,6 +37,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -77,11 +63,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Mark to Literature Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing mark to literature relation')

View file

@ -5,32 +5,16 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importMarkToMarkInformationRelation(api, engine):
print('Initializing the database...') print('importing mark to mark information relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
test = False test = False
tableName = "r__mar__6760_markenart" tableName = "r__mar__6760_markenart"
bundleId = 'b241e8063b9259428967fa4ff134a8bd' # Mark to mark information relation bundleId = 'b241e8063b9259428967fa4ff134a8bd' # Mark to mark information relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -40,9 +24,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -52,6 +36,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -75,11 +62,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Mark to Mark Information Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing mark to mark information relation')

View file

@ -5,33 +5,17 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importMarkToSourceRelation(api, engine):
print('Initializing the database...') print('importing mark to source relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
test = False test = False
tableName = "r__mar__8130_que_kurzt_" tableName = "r__mar__8130_que_kurzt_"
bundleId = 'b0edbf644e07765a5ae319802ec0289b' # Mark to source relation bundleId = 'b0edbf644e07765a5ae319802ec0289b' # Mark to source relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -41,9 +25,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -53,6 +37,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -76,11 +63,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Mark to Source Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing mark to source relation')

View file

@ -5,33 +5,17 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importSourceToDateRelation(api, engine):
print('Initializing the database...') print('importing source to date relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
test = False test = False
tableName = "r__que__8100_datum" tableName = "r__que__8100_datum"
bundleId = 'b4b8ba242075bf2c778894911c7f3264' # Source to date relation bundleId = 'b4b8ba242075bf2c778894911c7f3264' # Source to date relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -41,9 +25,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -53,6 +37,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -76,11 +63,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Source to Date Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing source to date relation')

View file

@ -5,33 +5,17 @@ import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling import pandas as pd # For dataframe handling
# Initialize the database def importSourceToLiteratureReferenceAssignmentRelation(api, engine):
print('Initializing the database...') print('importing source to literature reference assignment relation')
engine, metadata, Session = initDb(True, './schemas/')
if engine == False:
print('Database initialization failed.')
exit()
# Load the environment variables
load_dotenv()
# Initialize the WissKI API
print('Initializing the WissKI API...')
api_url = os.getenv('API_URL')
auth = (os.getenv('API_USERNAME'), os.getenv('API_PASSWORD'))
headers = {"Cache-Control": "no-cache"}
api = Api(api_url, auth, headers)
api.pathbuilder = api.get_pathbuilder('relations')
test = False test = False
tableName = "r__que__8330_lit_kurzt_" tableName = "r__que__8330_lit_kurzt_"
bundleId = 'bed2f320214a0344287c6c4db40e9331' # Source to literature reference assignemnt relation bundleId = 'bed2f320214a0344287c6c4db40e9331' # Source to literature reference assignemnt relation
try: try:
processedRows = pd.read_csv(f'./logs/processed-{tableName}.csv') processedRows = pd.read_csv(f'./logs/{tableName}.csv')
except FileNotFoundError: except FileNotFoundError:
processedRows = pd.DataFrame(columns=['docId', 'uuid', 'uri']) processedRows = pd.DataFrame(columns=[ 'id', 'uuid', 'uri'])
# Load sources table # Load sources table
sqlTable = pd.read_sql_table(tableName, con=engine) sqlTable = pd.read_sql_table(tableName, con=engine)
@ -41,9 +25,9 @@ entityValues = {}
# Create entities # Create entities
for index, row in sqlTable.iterrows(): for index, row in sqlTable.iterrows():
# For every row in table... # For every row in table...
if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'docId']: if index < len(processedRows) and sqlTable.loc[index, 'id'] == processedRows.loc[index, 'id']:
# skip if already processed # skip if already processed
print(f'Skipping already processed entity {sqlTable.loc[index, 'id']}') print(f'Skipping already processed entity {sqlTable.loc[index, "id"]}')
continue continue
# Create Entity property dicts # Create Entity property dicts
entityValues = {} entityValues = {}
@ -53,6 +37,9 @@ for index, row in sqlTable.iterrows():
# skip if cell has no value # skip if cell has no value
continue continue
# Properties of an entity have to be an array, so... # Properties of an entity have to be an array, so...
value = str(value).replace('&###{{new_line}}###'.format(), '&')
value = str(value).replace('###{{new_line}}###', '&')
value = str(value).replace(' & ', '&')
if '&' in str(value): if '&' in str(value):
# ...Explode "&"-separated values to array items # ...Explode "&"-separated values to array items
value = [x.strip() for x in str(value).split('&')] value = [x.strip() for x in str(value).split('&')]
@ -76,11 +63,11 @@ for index, row in sqlTable.iterrows():
entity = Entity(api=api, fields=entityValues, bundle_id=bundleId) entity = Entity(api=api, fields=entityValues, bundle_id=bundleId)
api.save(entity) api.save(entity)
print(f'Created entity {index}: {entity.uri} of {len(sqlTable)}') print(f'Created Source to Literature Reference Assignment Relation {index}: {entity.uri} of {len(sqlTable)}')
# Write log # Write log
processedRows = processedRows._append({'docId': docId, 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True) processedRows = processedRows._append({'id': row['id'], 'uuid': fUuid, 'uri': entity.uri}, ignore_index=True)
processedRows.to_csv(f'./logs/processed-{tableName}.csv', index=False) processedRows.to_csv(f'./logs/{tableName}.csv', index=False)
if test: if test:
exit() exit()
print('finish') print('finished importing source to literature reference assignment relation')

View file

@ -19,7 +19,7 @@ def initDb(_production, schemaDir):
return (False, False) return (False, False)
if _production: if _production:
dbName = 'ngk' dbName = 'ngk_data_alt'
else: else:
dbName = 'testngk' dbName = 'testngk'

View file

@ -20,7 +20,7 @@ def createClass(name, columns):
tableName = name.lower().replace('-', '_').replace('.', '_').replace(' ', '_').replace('(', '_').replace(')', '_').replace('ä', 'ae').replace('ö', 'oe').replace('ü', 'ue').replace('ß', 'ss').replace('?', '_') tableName = name.lower().replace('-', '_').replace('.', '_').replace(' ', '_').replace('(', '_').replace(')', '_').replace('ä', 'ae').replace('ö', 'oe').replace('ü', 'ue').replace('ß', 'ss').replace('?', '_')
# Transform columns and add prefix # Transform columns and add prefix
attrs = {'__tablename__': tableName} attrs = {'__tablename__': tableName, '__table_args__': {'extend_existing': True}}
attrs.update({prop.lower().replace('-', '_').replace('.', '_').replace(' ', '_').replace('(', '_').replace(')','_').replace('ä', 'ae').replace('ö', 'oe').replace('ü', 'ue').replace('ß', 'ss').replace('?', '_'): (Column(String(36), primary_key=True) if prop.lower() == 'uuid' else Column(Text)) for prop in columns}) attrs.update({prop.lower().replace('-', '_').replace('.', '_').replace(' ', '_').replace('(', '_').replace(')','_').replace('ä', 'ae').replace('ö', 'oe').replace('ü', 'ue').replace('ß', 'ss').replace('?', '_'): (Column(String(36), primary_key=True) if prop.lower() == 'uuid' else Column(Text)) for prop in columns})
# If 'uuid' is not in columns, add 'id' as primary key # If 'uuid' is not in columns, add 'id' as primary key
@ -30,9 +30,6 @@ def createClass(name, columns):
# Create SQLAlchemy class # Create SQLAlchemy class
cls = type(className, (Base,), attrs) cls = type(className, (Base,), attrs)
# Define the table with extend_existing=True
Table(tableName, Base.metadata, extend_existing=True)
return cls return cls
def initClassesFromSchemas(schemaDir): def initClassesFromSchemas(schemaDir):

View file

@ -3,4 +3,4 @@ pandas
pymysql pymysql
sqlalchemy sqlalchemy
tqdm tqdm
wisski_py