wisski_database_checker/test-wisski-data-integrity.php
2025-12-09 10:49:31 +01:00

289 lines
8.8 KiB
PHP

<?php
/**
* @file
* Script to test WissKI table data integrity before and after schema updates.
*
* Usage:
* php test-wisski-data-integrity.php export # Export data before update
* php test-wisski-data-integrity.php verify # Verify data after update
*/
// Bootstrap Drupal.
$autoloader = require_once __DIR__ . '/autoload.php';
$request = \Symfony\Component\HttpFoundation\Request::createFromGlobals();
$kernel = \Drupal\Core\DrupalKernel::createFromRequest($request, $autoloader, 'prod');
$kernel->boot();
// Database connection.
$connection = \Drupal::database();
$outputDir = __DIR__ . '/wisski-data-integrity-test';
// Get all WissKI tables.
function getWisskiTables($connection) {
$tables = [];
// Query information_schema to get all WissKI tables.
$database = $connection->getConnectionOptions();
$dbName = $database['database'];
$result = $connection->query("SELECT TABLE_NAME FROM information_schema.TABLES WHERE TABLE_SCHEMA = :db AND TABLE_NAME LIKE 'wisski%'", [
':db' => $dbName,
]);
while ($row = $result->fetchAssoc()) {
$tables[] = $row['TABLE_NAME'];
}
return $tables;
}
// Export table data: row count, checksum, sample rows.
function exportTableData($connection, $tableName, $outputDir) {
$data = [
'table' => $tableName,
'timestamp' => date('Y-m-d H:i:s'),
'row_count' => 0,
'checksum' => '',
'sample_rows' => [],
'column_info' => [],
];
// Get row count.
$countResult = $connection->query("SELECT COUNT(*) as cnt FROM {" . $tableName . "}");
$data['row_count'] = (int) $countResult->fetchField();
if ($data['row_count'] === 0) {
$data['checksum'] = 'empty';
return $data;
}
// Get column information.
$columnsResult = $connection->query("SHOW COLUMNS FROM {" . $tableName . "}");
$columns = [];
while ($col = $columnsResult->fetchAssoc()) {
$columns[] = $col['Field'];
}
$data['column_info'] = $columns;
// Get sample rows (first 10 rows).
$sampleResult = $connection->query("SELECT * FROM {" . $tableName . "} LIMIT 10");
$samples = [];
while ($row = $sampleResult->fetchAssoc()) {
$samples[] = $row;
}
$data['sample_rows'] = $samples;
// Calculate checksum of all data.
// For large tables, we'll use a hash of all row data concatenated.
// Build checksum query - use backticks for column names.
$checksumFields = [];
foreach ($columns as $col) {
$checksumFields[] = "COALESCE(CAST(`" . $col . "` AS CHAR), '')";
}
$orderByCols = array_slice($columns, 0, min(5, count($columns)));
$orderBy = implode(', ', array_map(function($col) {
return "`" . $col . "`";
}, $orderByCols));
// Use CONCAT_WS for safer concatenation.
$concatExpr = "CONCAT_WS('|', " . implode(', ', $checksumFields) . ")";
$checksumQuery = "SELECT MD5(GROUP_CONCAT(" . $concatExpr .
" ORDER BY " . $orderBy . " SEPARATOR '|||')) as chksum FROM {" . $tableName . "}";
try {
$checksumResult = $connection->query($checksumQuery);
$data['checksum'] = $checksumResult->fetchField() ?: 'no_checksum';
}
catch (\Exception $e) {
// If checksum fails (e.g., GROUP_CONCAT too large), use row count + first row hash.
$data['checksum'] = 'partial_' . md5($data['row_count'] . serialize($samples[0] ?? []));
}
return $data;
}
// Export all WissKI tables.
function exportAllTables($connection, $outputDir) {
if (!is_dir($outputDir)) {
mkdir($outputDir, 0755, TRUE);
}
// Delete old JSON files before exporting new data.
if (is_dir($outputDir)) {
$files = glob($outputDir . '/*.json');
$deletedCount = 0;
foreach ($files as $file) {
if (is_file($file)) {
unlink($file);
$deletedCount++;
}
}
if ($deletedCount > 0) {
echo "Deleted {$deletedCount} old JSON file(s) from previous export.\n";
}
}
$tables = getWisskiTables($connection);
$allData = [
'export_timestamp' => date('Y-m-d H:i:s'),
'tables' => [],
];
echo "Exporting data for " . count($tables) . " WissKI tables...\n";
foreach ($tables as $table) {
echo " Processing {$table}...\n";
$tableData = exportTableData($connection, $table, $outputDir);
$allData['tables'][$table] = $tableData;
// Save individual table export.
$tableFile = $outputDir . '/' . $table . '.json';
file_put_contents($tableFile, json_encode($tableData, JSON_PRETTY_PRINT));
}
// Save summary.
$summaryFile = $outputDir . '/before-update-summary.json';
file_put_contents($summaryFile, json_encode($allData, JSON_PRETTY_PRINT));
echo "\nExport complete! Data saved to: {$outputDir}\n";
echo "Summary: {$summaryFile}\n";
return $allData;
}
// Verify data integrity after update.
function verifyDataIntegrity($connection, $outputDir) {
$summaryFile = $outputDir . '/before-update-summary.json';
if (!file_exists($summaryFile)) {
echo "ERROR: Before-update summary not found at: {$summaryFile}\n";
echo "Please run 'export' first.\n";
return FALSE;
}
$beforeData = json_decode(file_get_contents($summaryFile), TRUE);
$issues = [];
$verified = [];
echo "Verifying data integrity for " . count($beforeData['tables']) . " tables...\n\n";
foreach ($beforeData['tables'] as $tableName => $beforeTableData) {
echo "Checking {$tableName}...\n";
$afterTableData = exportTableData($connection, $tableName, $outputDir);
// Check row count.
if ($afterTableData['row_count'] !== $beforeTableData['row_count']) {
$issues[] = [
'table' => $tableName,
'issue' => 'row_count_mismatch',
'before' => $beforeTableData['row_count'],
'after' => $afterTableData['row_count'],
'difference' => $afterTableData['row_count'] - $beforeTableData['row_count'],
];
echo " ⚠️ ROW COUNT MISMATCH: Before={$beforeTableData['row_count']}, After={$afterTableData['row_count']}\n";
}
else {
echo " ✓ Row count matches: {$afterTableData['row_count']}\n";
}
// Check checksum.
if ($beforeTableData['checksum'] !== 'empty' && $afterTableData['checksum'] !== 'empty') {
if ($beforeTableData['checksum'] !== $afterTableData['checksum']) {
$issues[] = [
'table' => $tableName,
'issue' => 'checksum_mismatch',
'before' => $beforeTableData['checksum'],
'after' => $afterTableData['checksum'],
];
echo " ⚠️ CHECKSUM MISMATCH: Data may have changed!\n";
}
else {
echo " ✓ Checksum matches\n";
}
}
// Compare sample rows if available.
if (!empty($beforeTableData['sample_rows']) && !empty($afterTableData['sample_rows'])) {
$sampleMatch = TRUE;
$minSamples = min(count($beforeTableData['sample_rows']), count($afterTableData['sample_rows']));
for ($i = 0; $i < $minSamples; $i++) {
if ($beforeTableData['sample_rows'][$i] !== $afterTableData['sample_rows'][$i]) {
$sampleMatch = FALSE;
break;
}
}
if (!$sampleMatch) {
$issues[] = [
'table' => $tableName,
'issue' => 'sample_data_mismatch',
];
echo " ⚠️ Sample data differs!\n";
}
else {
echo " ✓ Sample data matches\n";
}
}
if (empty(array_filter($issues, function($issue) use ($tableName) {
return $issue['table'] === $tableName;
}))) {
$verified[] = $tableName;
}
echo "\n";
}
// Save verification report.
$report = [
'verification_timestamp' => date('Y-m-d H:i:s'),
'tables_verified' => count($verified),
'tables_with_issues' => count($issues),
'verified_tables' => $verified,
'issues' => $issues,
];
$reportFile = $outputDir . '/verification-report.json';
file_put_contents($reportFile, json_encode($report, JSON_PRETTY_PRINT));
// Print summary.
echo "\n" . str_repeat('=', 60) . "\n";
echo "VERIFICATION SUMMARY\n";
echo str_repeat('=', 60) . "\n";
echo "Tables verified: " . count($verified) . "\n";
echo "Tables with issues: " . count($issues) . "\n";
if (!empty($issues)) {
echo "\n⚠️ ISSUES FOUND:\n";
foreach ($issues as $issue) {
echo " - {$issue['table']}: {$issue['issue']}\n";
if (isset($issue['difference'])) {
echo " Difference: {$issue['difference']} rows\n";
}
}
echo "\nFull report: {$reportFile}\n";
return FALSE;
}
else {
echo "\n✓ All tables verified successfully! No data loss detected.\n";
echo "Report: {$reportFile}\n";
return TRUE;
}
}
// Main execution.
$command = $argv[1] ?? 'help';
switch ($command) {
case 'export':
exportAllTables($connection, $outputDir);
break;
case 'verify':
verifyDataIntegrity($connection, $outputDir);
break;
default:
echo "Usage:\n";
echo " php test-wisski-data-integrity.php export # Export data before update\n";
echo " php test-wisski-data-integrity.php verify # Verify data after update\n";
break;
}