new commit

This commit is contained in:
rnsrk 2025-09-09 10:16:31 +02:00
parent da296f8a64
commit e46a9fd4ec
69 changed files with 4199 additions and 4805 deletions

207
21_importArtifacts.py Normal file
View file

@ -0,0 +1,207 @@
import uuid # For UUID creation
from initDb import initDb # For database initialization
from wisski.api import Api, Pathbuilder, Entity # For WissKI API
import os # For environment variable loading
from dotenv import load_dotenv # For environment variable loading
import pandas as pd # For dataframe handling
# --- WissKI bundle ids -------------------------------------------------------
_ARTIFACT_BUNDLE = 'bd30c2c64a3caa8bb1628c780c3f24bb'
_PRODUCTION_PLACE_ASSIGNMENT_BUNDLE = 'b13bc6dc04d4bbdafb9536987eb43244'
_DIMENSION_BUNDLE = 'b73258adf62f35bd1be3fa2863fab558'
_IMAGE_BUNDLE = 'b8c6c4b478ead1c80e175ad0f98dafe3'
_IMAGE_ASSIGNMENT_BUNDLE = 'b88e5d94fb2a83d62df99cf64d6c010c'
_DIGITISATION_PROCESS_BUNDLE = 'b22e6c47ccb3ab8a974b37279e1bc33b'

# --- Source column -> Artifact field id --------------------------------------
_ARTIFACT_UUID_FIELD = 'feb48c9a7efc444449b4b8defcd6d8bd'
_ARTIFACT_FIELDS = {
    'f__uuid': _ARTIFACT_UUID_FIELD,  # UUID
    'f__5000_obj_dok_nr_': 'f7e2a8a273ab3d577bf5854902550c09',  # Document Identifier
    'f__500n_ngk_nr_': 'f6e041bd0b16b21596849732c01cb168',  # NGK Number
    'f__5200_obj_titel': 'fd06dcc49a29b1a63fa4a789ec17e5c6',  # Title
    'f__5210_status': 'f35c9c9b0991729c36acb41645fe81d1',  # Status
    'f__5220_gattung': 'f2fd7f8a81d5eb1a20371b9acfd1ab59',  # Genre
    'f__5223_form__attribut': 'f05bbd6e29a7d303e4370b04c12b3f75',  # Formattribute
    'f__5226_art': 'f593fa773a6ea458101ba2325a18abbe',  # Artifact type
    'f__523f_funktion': 'f476ba24127d4dff1018acebf45a05f6',  # Function
    'f__5240_formtyp': 'fa7cfd9dbb3d2517c1898b3051d8dbed',  # Shape
    'f__524g_gestalt': 'f8309a21fa79bc6bd2506060b419d2df',  # Figure
    'f__55ng_darst__schlagw_': 'f6abbd4f39a6f79de5de2b14b98e51ff',  # Keywords
    'f__5bes_beschreibung': 'f26ad2bc1f084478cd7011f7b8451526',  # Description
    'f__5ges_geschichte': 'f40120d7c13ef02b486c69245f6c2306',  # History
    'f__68an_abdruck_nr_': 'fd3740649cc06f45677eb0546908cdac',  # Print Number
    'f__stwv_statwerkverz': 'fee0db94d62fae6370a89ff4757ff539',  # Catalogue_of_Works
    'f__9990_kommentar': 'fefe289aa0c9563a153be6da7d37e3ff',  # Comment
    'f__ptxt_plug_in_text': 'ffb8b04e8d57929a596fc32d6a84d07d',  # Plugin text
}
# Artifact fields that reference the reified sub-entities by their UUID.
_ARTIFACT_PRODUCTION_PLACE_ASSIGNMENT_FIELD = 'f2676a0fb8db6ab62235328ae7c7a4b3'
_ARTIFACT_DIMENSION_FIELD = 'fc700eb3f24f4f2a6c165128aa7117f1'
_ARTIFACT_IMAGE_ASSIGNMENT_FIELD = 'f7af1cd9c77448281dd7ecf29ba57e3e'
_ARTIFACT_DIGITISATION_PROCESS_FIELD = 'f5a3f90d920da3db4cfdbaa6264b0e89'

# --- Production Place Assignment ---------------------------------------------
_PRODUCTION_PLACE_COLUMN = 'f__5130_entst_ort'
_PRODUCTION_PLACE_ASSIGNMENT_UUID_FIELD = 'f40cc95db3ccaa1dbbf27294338d9f07'
_PRODUCTION_PLACE_FIELD = 'f43f9589eef324fb12c26226dfe94246'  # Production Place

# --- Dimension: source column -> dimension type label ------------------------
_DIMENSION_TYPES = {
    'f__5362_hoehe': 'height',
    'f__5364_breite': 'width',
    'f__5366_tiefe': 'depth',
    'f__5368_laenge': 'length',
    'f__5370_durchmesser': 'diameter',
    'f__5380_gewicht': 'weight',
    'f__538h_hist__gewicht': 'historical_weight',
}
_DIMENSION_TYPE_FIELD = 'f31e9c7e2de5549daea1790a74615288'  # Type
_DIMENSION_VALUE_FIELD = 'f3f805d270890837a6493e7e60a96487'  # Dimension
_DIMENSION_UUID_FIELD = 'f802fd7bf45be523a9b188411a591420'  # UUID

# --- Image / Image Assignment ------------------------------------------------
_IMAGE_COLUMN = 'f__8540_repro_nr_'
_IMAGE_REPRO_NUMBER_FIELD = 'feb10344eaa7a5f414d1e8392853eba9'  # Reproduction Number (Image)
_IMAGE_FILE_FIELD = 'fc8d57e55f203c75c2f8a1ae79378ac7'  # File
_IMAGE_UUID_FIELD = 'f11beac4b638016479e6f3fbc7e55d1a'  # UUID
_IMAGE_ASSIGNMENT_UUID_FIELD = 'f067784f5b1ff850672124a2b05360de'
_IMAGE_ASSIGNMENT_IMAGES_FIELD = 'f70afb79b45472fee3d02f011caa4b36'  # List of Image UUIDs
# Directory prefixes rewritten in reproduction numbers; applied in this order.
_IMAGE_PATH_REPLACEMENTS = (
    ('Objekte/', 'objects/'),
    ('Objekte\\', 'objects/'),
    ('Objekte3\\', 'objects/'),
    ('Objekte4\\', 'objects/'),
    ('objekte4\\', 'objects/'),
    ('Objekte5\\', 'objects/'),
    ('objekte5\\', 'objects/'),
    ('Marken\\', 'marks/'),
    ('Marken/', 'marks/'),
)

# --- Digitisation Process: source column -> field id -------------------------
_DIGITISATION_PROCESS_UUID_FIELD = 'f32274ec0032b8778ba69d20108590cc'
_DIGITISATION_PROCESS_FIELDS = {
    'f__9900_datum_erfassung': 'f1f5dd22371e5c1de41e0fb099e0e862',  # Recording date
    'f__99ae_datum_aenderung': 'f8976c6a9e5d91fe9caba8a57c27f204',  # Change date
    'f__efbm_bem_erfassung': 'f78a6310d13c717b82ddba814ac59024',  # Recording note
}


def _isEmptyCell(value) -> bool:
    """Return True for cells without content: None, '' or a pandas NA (NaN/NaT)."""
    if value is None:
        return True
    if isinstance(value, str):
        return value == ''
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Not a scalar pandas can classify; treat it as a real value.
        return False


def _cellToValues(value) -> list:
    """Turn one table cell into the list of strings an entity field expects.

    New-line placeholders are converted to "&", then the text is split on "&".
    Fragments are stripped and empty fragments are dropped. Text without any
    "&" is returned unchanged as a single-item list.
    """
    text = str(value)
    # NOTE(review): the first pattern passes through str.format(), so it matches
    # '&###{new_line}###' (single braces), while the second matches the literal
    # '###{{new_line}}###' (double braces). Both are kept exactly as they were;
    # confirm against the source data which placeholder form really occurs.
    text = text.replace('&###{{new_line}}###'.format(), '&')
    text = text.replace('###{{new_line}}###', '&')
    if '&' not in text:
        return [text]
    fragments = (part.strip() for part in text.split('&'))
    return [part for part in fragments if part]


def _normaliseImagePath(name: str) -> str:
    """Rewrite the known source directory prefixes of a reproduction number."""
    for old, new in _IMAGE_PATH_REPLACEMENTS:
        name = name.replace(old, new)
    return name


def importArtifacts(api, engine):
    """Import every row of the ``c__obj`` table as an Artifact entity.

    For each row the function saves, in this order, a Production Place
    Assignment, one Dimension per filled dimension column, one Image per
    reproduction number, an Image Assignment (only if the row has images), a
    Digitisation Process and finally the Artifact itself. The Artifact refers
    to the other entities through freshly generated UUIDs.

    Each imported row is appended to ``./logs/c__obj.csv`` (columns ``id``,
    ``uuid``, ``uri``). Rows whose ``id`` is already in that log are skipped,
    so an interrupted import can be restarted.

    Args:
        api: Object handed to ``Entity(api=...)`` and whose ``save`` method
            persists each entity.
        engine: Database connection passed as ``con`` to
            ``pandas.read_sql_table``.
    """
    print('Importing artifacts...')
    tableName = "c__obj"
    logDir = './logs'
    logPath = f'./logs/{tableName}.csv'
    try:
        processedRows = pd.read_csv(logPath)
    except FileNotFoundError:
        processedRows = pd.DataFrame(columns=['id', 'uuid', 'uri'])
    # Look rows up by id instead of by position so resuming does not depend on
    # the database returning the rows in the same order as the previous run.
    processedIds = set(processedRows['id'])
    # Make sure the log can be written before any entity is created.
    os.makedirs(logDir, exist_ok=True)
    writeHeader = not os.path.exists(logPath)

    # Load artifacts table
    artifactsTable = pd.read_sql_table(tableName, con=engine)
    total = len(artifactsTable)

    # Create artifacts
    for index, row in artifactsTable.iterrows():
        rowId = row['id']
        if rowId in processedIds:
            print(f'Skipping already processed artifact {rowId}')
            continue

        # Entity property dicts; sub-entities get their linking UUID up front.
        artifactValues = {}
        digitisationProcessValues = {_DIGITISATION_PROCESS_UUID_FIELD: [str(uuid.uuid4())]}
        imageAssignmentValues = {_IMAGE_ASSIGNMENT_UUID_FIELD: [str(uuid.uuid4())]}
        productionPlaceAssignmentValues = {_PRODUCTION_PLACE_ASSIGNMENT_UUID_FIELD: [str(uuid.uuid4())]}
        dimensionValues = {}
        imageValues = {}

        # Map columns to fields. We use assignments for reification.
        for key, cell in row.items():
            if key == 'id' or _isEmptyCell(cell):
                continue
            values = _cellToValues(cell)
            if not values:
                # Cell consisted of separators only.
                continue
            if key in _ARTIFACT_FIELDS:
                artifactValues[_ARTIFACT_FIELDS[key]] = values
            elif key in _DIGITISATION_PROCESS_FIELDS:
                digitisationProcessValues[_DIGITISATION_PROCESS_FIELDS[key]] = values
            elif key == _PRODUCTION_PLACE_COLUMN:
                productionPlaceAssignmentValues[_PRODUCTION_PLACE_FIELD] = values
            elif key in _DIMENSION_TYPES:
                dimensionValues[key] = {
                    _DIMENSION_TYPE_FIELD: [_DIMENSION_TYPES[key]],
                    _DIMENSION_VALUE_FIELD: values,
                    _DIMENSION_UUID_FIELD: [str(uuid.uuid4())],
                }
            elif key == _IMAGE_COLUMN:
                for item in values:
                    imageName = _normaliseImagePath(item)
                    imageValues[imageName] = {
                        _IMAGE_REPRO_NUMBER_FIELD: [imageName],
                        _IMAGE_FILE_FIELD: ['public://artifact_images/' + imageName + '.jpg'],
                        _IMAGE_UUID_FIELD: [str(uuid.uuid4())],
                    }
            else:
                print(f'{key} is not a valid field, skipping.')

        # Create Production Place Assignment. Entities could be saved in one
        # batch, but each one is saved immediately here.
        productionPlaceAssignment = Entity(
            api=api,
            fields=productionPlaceAssignmentValues,
            bundle_id=_PRODUCTION_PLACE_ASSIGNMENT_BUNDLE,
        )
        api.save(productionPlaceAssignment)

        # Create Dimension entities and collect their UUIDs,
        # because Artifact and Dimension are linked over the UUID.
        dimensionUuids = []
        for fields in dimensionValues.values():
            api.save(Entity(api=api, fields=fields, bundle_id=_DIMENSION_BUNDLE))
            dimensionUuids.append(fields[_DIMENSION_UUID_FIELD][0])

        # Create Image entities and collect their UUIDs,
        # because Image Assignment and Image are linked over the UUID.
        imageUuids = []
        for fields in imageValues.values():
            api.save(Entity(api=api, fields=fields, bundle_id=_IMAGE_BUNDLE))
            imageUuids.append(fields[_IMAGE_UUID_FIELD][0])

        # Create the Image Assignment only when there are images, and link the
        # Artifact to it only in that case so no reference points at an entity
        # that was never created.
        if imageUuids:
            imageAssignmentValues[_IMAGE_ASSIGNMENT_IMAGES_FIELD] = imageUuids
            imageAssignment = Entity(
                api=api,
                fields=imageAssignmentValues,
                bundle_id=_IMAGE_ASSIGNMENT_BUNDLE,
            )
            api.save(imageAssignment)
            artifactValues[_ARTIFACT_IMAGE_ASSIGNMENT_FIELD] = [
                imageAssignmentValues[_IMAGE_ASSIGNMENT_UUID_FIELD][0]
            ]

        # Create Digitisation Process
        digitisationProcess = Entity(
            api=api,
            fields=digitisationProcessValues,
            bundle_id=_DIGITISATION_PROCESS_BUNDLE,
        )
        api.save(digitisationProcess)

        # Add the field values for reference.
        # NOTE(review): is the Production Place Assignment reference value
        # correct? (Open question carried over from the original code.)
        artifactValues[_ARTIFACT_PRODUCTION_PLACE_ASSIGNMENT_FIELD] = [
            productionPlaceAssignmentValues[_PRODUCTION_PLACE_ASSIGNMENT_UUID_FIELD][0]
        ]
        artifactValues[_ARTIFACT_DIMENSION_FIELD] = dimensionUuids
        artifactValues[_ARTIFACT_DIGITISATION_PROCESS_FIELD] = [
            digitisationProcessValues[_DIGITISATION_PROCESS_UUID_FIELD][0]
        ]

        # Create Artifact
        artifact = Entity(api=api, fields=artifactValues, bundle_id=_ARTIFACT_BUNDLE)
        api.save(artifact)
        print(f'Created artifact {index}: {artifact.uri} of {total}')

        # Write log: append just this row instead of rewriting the whole file.
        # A row without an f__uuid value is logged with an empty uuid rather
        # than raising after all entities have already been saved.
        logEntry = pd.DataFrame(
            [{
                'id': rowId,
                'uuid': artifactValues.get(_ARTIFACT_UUID_FIELD, [None])[0],
                'uri': artifact.uri,
            }],
            columns=['id', 'uuid', 'uri'],
        )
        logEntry.to_csv(logPath, mode='a', header=writeHeader, index=False)
        writeHeader = False
        processedIds.add(rowId)
    print('finished importing artifacts')