# open-productive-stack/diagnostic.sh
#
# 515 lines
# 17 KiB
# Bash
# Executable file

#!/bin/bash
# Diagnostics report: verifies Docker/Compose availability, then checks the
# Traefik, Forgejo, database, Nextcloud and Mailcow services, their TLS
# certificates and the mail-related DNS records, printing a colored report.
#
# Colors for output.
# The values are stored as literal backslash sequences (single-quoted), so they
# only turn into terminal escapes when printed through `echo -e`.
green='\033[0;32m'
yellow='\033[1;33m'
red='\033[0;31m'
blue='\033[0;34m'
# Resets the terminal color after a colored span.
noColor='\033[0m'
# Print a blank line followed by a blue "--- <title> ---" section banner.
# Globals:   blue, noColor (read)
# Arguments: $1 - section title
printSection() {
  local title="$1"
  echo -e "\n${blue}--- ${title} ---${noColor}"
}
# Print one result line for a check and translate the status into an exit code.
# A textual [OK]/[FAIL] marker is printed inside the colored span so that the
# two outcomes stay distinguishable even when colors are not rendered.
# Globals:   green, red, noColor (read)
# Arguments: $1 - status; "0", "true", "OK" or "healthy" count as success,
#                 anything else counts as failure
#            $2 - human-readable description of the check
# Returns:   0 on success status, 1 otherwise
checkStatus() {
  local status="$1"
  local message="$2"
  case "$status" in
    0 | true | OK | healthy)
      echo -e "${green}[OK]${noColor} $message"
      return 0
      ;;
  esac
  echo -e "${red}[FAIL]${noColor} $message"
  return 1
}
# Print a warning line. The [WARN] marker sits inside the yellow span so the
# line is recognizable as a warning even without color support.
# Globals:   yellow, noColor (read)
# Arguments: $1 - warning text
warnStatus() {
  local message="$1"
  echo -e "${yellow}[WARN]${noColor} $message"
}
# Report whether the given name shows up in the `docker ps` listing.
# The name is used as a grep pattern over the complete listing, so it matches
# any column and also matches as a substring of a longer name.
# Globals:   yellow, green, red, noColor (read)
# Arguments: $1 - service/container name to look for
# Returns:   0 when a line matches, 1 otherwise
checkService() {
  local service="$1"
  echo -e "${yellow}Checking if ${service} is running...${noColor}"
  if ! docker ps | grep -q "$service"; then
    echo -e "${red}${service} is not running.${noColor}"
    return 1
  fi
  echo -e "${green}${service} is running.${noColor}"
  return 0
}
# Probe a TCP port with `nc -z` (5 second timeout) and report the outcome.
# Globals:   yellow, green, red, noColor (read)
# Arguments: $1 - label used in the messages
#            $2 - port to probe
#            $3 - host to probe (optional, defaults to localhost)
# Returns:   0 when nc succeeds, 1 otherwise
checkConnectivity() {
  local service="$1" port="$2" host="${3:-localhost}"
  local endpoint="${host}:${port}"
  echo -e "${yellow}Checking connectivity to ${service} on ${endpoint}...${noColor}"
  if ! nc -z -v -w5 "$host" "$port" 2>/dev/null; then
    echo -e "${red}Cannot connect to ${service} on ${endpoint}.${noColor}"
    return 1
  fi
  echo -e "${green}Connection to ${service} on ${endpoint} successful.${noColor}"
  return 0
}
# Load the core stack's .env file into the current shell and show DOMAIN.
# When the file is missing, DOMAIN falls back to "example.com".
# Globals:   DOMAIN (written by the fallback; otherwise whatever the sourced
#            file defines), red, yellow, noColor (read)
# Returns:   0 when the file was sourced, 1 when it does not exist
loadCoreDomain() {
  local coreEnvFile="/var/deploy/core/.env"
  if [[ ! -f "$coreEnvFile" ]]; then
    echo -e "${red}Core .env file not found.${noColor}"
    DOMAIN="example.com"
    return 1
  fi
  # The file is executed as shell code in this process.
  # shellcheck disable=SC1090
  . "$coreEnvFile"
  echo -e "${yellow}Domain configuration:${noColor} ${DOMAIN}"
  return 0
}
# Read MAILCOW_HOSTNAME from mailcow.conf into the global mailcowHostname.
# The value is the second "="-separated field of the matching line(s).
# Globals:   mailcowHostname (written; empty when the file is missing)
# Returns:   0 when mailcow.conf exists, 1 otherwise
loadMailcowHostname() {
  local mailcowConfFile="/var/deploy/mailcow/mailcow.conf"
  if [[ ! -f "$mailcowConfFile" ]]; then
    mailcowHostname=""
    return 1
  fi
  mailcowHostname=$(grep '^MAILCOW_HOSTNAME=' "$mailcowConfFile" | cut -d= -f2)
  return 0
}
# Check the Traefik container: it must be running, then its HTTP (80),
# HTTPS (443) and SSH (2424) ports are probed and the presence of
# /certificates/acme.json inside the container is reported.
# Globals:   yellow, green, red, noColor (read)
# Returns:   1 when the container is not running, 0 otherwise (failed port or
#            certificate checks are only reported)
checkTraefik() {
  echo -e "${yellow}Checking Traefik configuration...${noColor}"
  checkService "traefik" || return 1

  local probe label port
  for probe in "HTTP:80" "HTTPS:443" "SSH:2424"; do
    label=${probe%%:*}
    port=${probe##*:}
    if ! checkConnectivity "Traefik ${label}" "$port"; then
      echo -e "${red}Traefik ${label} port not accessible.${noColor}"
    fi
  done

  echo -e "${yellow}Checking Traefik certificates...${noColor}"
  if ! docker exec traefik ls -la /certificates/acme.json >/dev/null 2>&1; then
    echo -e "${red}Traefik certificates not found.${noColor}"
  else
    echo -e "${green}Traefik certificates found.${noColor}"
  fi
  return 0
}
# Check the Forgejo container: it must be running, then app.ini is searched
# for ROOT_URL and SSH_PORT and a non-interactive SSH login on port 2424 is
# attempted against git.<DOMAIN>.
# Globals:   DOMAIN (read), yellow, green, red, noColor (read)
# Returns:   1 when the container is not running, 0 otherwise (the remaining
#            findings are only reported)
checkForgejo() {
  echo -e "${yellow}Checking Forgejo configuration...${noColor}"
  checkService "forgejo" || return 1

  local appIni="/data/gitea/conf/app.ini"
  # Plain if/else instead of "A && B || C", which would also run C whenever B
  # itself fails.
  if docker exec forgejo grep -q "ROOT_URL" "$appIni"; then
    echo -e "${green}Forgejo root URL is configured.${noColor}"
  else
    echo -e "${red}Forgejo root URL is not configured.${noColor}"
  fi
  if docker exec forgejo grep -q "SSH_PORT" "$appIni"; then
    echo -e "${green}Forgejo SSH port is configured.${noColor}"
  else
    echo -e "${red}Forgejo SSH port is not configured.${noColor}"
  fi

  echo -e "${yellow}Checking Forgejo SSH connection...${noColor}"
  local sshTarget="git@git.${DOMAIN}"
  # BatchMode prevents password prompts; ConnectTimeout keeps the report from
  # hanging when the port is filtered.
  if ssh -T -p 2424 \
    -o StrictHostKeyChecking=no \
    -o BatchMode=yes \
    -o ConnectTimeout=10 \
    "$sshTarget" &>/dev/null; then
    echo -e "${green}Forgejo SSH connection successful.${noColor}"
  else
    echo -e "${red}Forgejo SSH connection failed. This is expected if you haven't set up SSH keys yet.${noColor}"
    echo -e "${yellow}Try: ssh -vT git@git.${DOMAIN} -p 2424${noColor}"
  fi
  return 0
}
# Report whether the MariaDB and PostgreSQL containers are running.
# Globals:   yellow, green, red, noColor (read)
# Returns:   always 0; the findings are only reported
checkDatabases() {
  echo -e "${yellow}Checking database services...${noColor}"
  local entry container label
  # Each entry is "<container name>:<display name>".
  for entry in "mariadb:MariaDB" "postgres:PostgreSQL"; do
    container=${entry%%:*}
    label=${entry#*:}
    # Plain if/else instead of "A && B || C", which would also print the
    # failure line whenever printing the success line fails.
    if checkService "$container"; then
      echo -e "${green}${label} is running.${noColor}"
    else
      echo -e "${red}${label} is not running.${noColor}"
    fi
  done
  return 0
}
# Check the Nextcloud stack: the nextcloud, nextcloud-redis and
# nextcloud-reverse-proxy containers must be running, `occ status` must report
# "installed: true", Redis must answer PING, and the occ output is scanned for
# a collation notice.
# Globals:   yellow, green, red, noColor (read)
# Returns:   1 when a container is missing, Nextcloud is not installed or
#            Redis does not answer; 0 otherwise
checkNextcloud() {
  echo -e "${yellow}Checking Nextcloud configuration...${noColor}"
  local container
  for container in "nextcloud" "nextcloud-redis" "nextcloud-reverse-proxy"; do
    checkService "$container" || return 1
  done

  echo -e "${yellow}Checking Nextcloud status...${noColor}"
  # Run occ once and reuse the output for every check below, so all checks see
  # the same snapshot and only one `docker exec` is needed.
  local occStatus
  occStatus=$(docker exec nextcloud php /var/www/html/occ status 2>&1)
  if ! grep -q "installed: true" <<<"$occStatus"; then
    echo -e "${red}Nextcloud is not properly installed.${noColor}"
    return 1
  fi
  echo -e "${green}Nextcloud is installed and operational.${noColor}"
  grep -E "version|maintenance" <<<"$occStatus" | sed 's/^/ /'

  echo -e "${yellow}Checking Redis connectivity...${noColor}"
  if ! docker exec nextcloud-redis redis-cli ping 2>&1 | grep -q "PONG"; then
    echo -e "${red}Redis is not responding.${noColor}"
    return 1
  fi
  echo -e "${green}Redis is responding.${noColor}"

  echo -e "${yellow}Checking database collation...${noColor}"
  # Any mention of "collation" in the occ output is treated as a mismatch.
  if grep -qi "collation" <<<"$occStatus"; then
    echo -e "${yellow}Database collation version mismatch detected.${noColor}"
    echo -e "${yellow}Run: ./nextcloud-maintenance.sh collation${noColor}"
  else
    echo -e "${green}Database collation is up to date.${noColor}"
  fi
  return 0
}
# Run checkService for every service of the stack, in a fixed order.
# Globals:   yellow, noColor (read)
# Returns:   always 0; each service's state is only reported
checkAllServices() {
  echo -e "${yellow}Checking all services...${noColor}"
  local services=("traefik" "forgejo" "mariadb" "postgres" "adminer" "nextcloud" "onlyoffice" "openproject" "hedgedoc" "drupal")
  # Declared local so the loop does not overwrite a caller's $service.
  local service
  for service in "${services[@]}"; do
    checkService "$service"
  done
  return 0
}
# Section 1 of the report: Traefik container state, the number of running
# Mailcow compose services, and the state of each critical Mailcow service.
# Globals:   yellow, noColor (read)
# Returns:   0 when jq is unavailable (Mailcow checks skipped); otherwise the
#            status of the last critical-service check
checkMailcowServices() {
  printSection "1. Service Health Status"
  echo -e "${yellow}Checking Docker services...${noColor}"

  local traefikStatus="FAIL"
  if docker ps --filter "name=traefik" --format "{{.Status}}" | grep -q "Up"; then
    traefikStatus="OK"
  fi
  checkStatus "$traefikStatus" "Traefik is running"

  # Everything below parses JSON; without jq every service would be reported
  # as down, so skip explicitly instead.
  if ! command -v jq >/dev/null 2>&1; then
    warnStatus "jq not found, skipping Mailcow service checks"
    return 0
  fi

  # Query compose once and keep "<service><TAB><state>" lines. The jq filter
  # accepts both one-object-per-line output and a single JSON array, since the
  # shape of `--format json` depends on the Compose version.
  local serviceStates
  serviceStates=$(cd /var/deploy/mailcow 2>/dev/null \
    && docker compose ps --format json 2>/dev/null \
    | jq -r '(if type == "array" then .[] else . end) | "\(.Service)\t\(.State)"' 2>/dev/null)

  # Count in the shell: `grep -c ... || echo 0` would yield "0" twice when
  # nothing matches, because grep -c prints 0 and also exits non-zero.
  local totalMailcow=0
  local mailcowServices=0
  local name state
  while IFS=$'\t' read -r name state; do
    [ -n "$name" ] || continue
    totalMailcow=$((totalMailcow + 1))
    if [ "$state" = "running" ]; then
      mailcowServices=$((mailcowServices + 1))
    fi
  done <<<"$serviceStates"

  if [ "$totalMailcow" -gt 0 ] && [ "$mailcowServices" -eq "$totalMailcow" ]; then
    checkStatus "OK" "All Mailcow services running (${mailcowServices}/${totalMailcow})"
  else
    checkStatus "FAIL" "Some Mailcow services not running (${mailcowServices}/${totalMailcow})"
  fi

  local criticalServices=("nginx-mailcow" "postfix-mailcow" "dovecot-mailcow" "mysql-mailcow" "acme-mailcow" "watchdog-mailcow")
  local service serviceStatus
  for service in "${criticalServices[@]}"; do
    # First state reported for this exact service name.
    serviceStatus=$(awk -F'\t' -v svc="$service" '$1 == svc { print $2; exit }' <<<"$serviceStates")
    if [ "$serviceStatus" = "running" ]; then
      checkStatus "OK" "${service} is running"
    else
      checkStatus "FAIL" "${service} is not running"
    fi
  done
}
# Section 2 of the report: Traefik ACME challenge flags and stored
# certificates, the Mailcow certificate/key files and their expiry, and an
# HTTPS probe of the Mailcow hostname.
# Globals:   mailcowHostname (read), yellow, noColor (read)
# Returns:   status of the last check performed
checkSslAndCerts() {
  printSection "2. SSL/TLS Configuration"
  echo -e "${yellow}Traefik SSL Configuration:${noColor}"

  # Read the NUL-separated command line of the container's PID 1 once and
  # search it for both challenge flags.
  local traefikCmdline
  traefikCmdline=$(docker exec traefik cat /proc/1/cmdline 2>/dev/null | tr '\0' '\n')
  local traefikHttpChallenge="FAIL"
  local traefikTlsChallenge="FAIL"
  if grep -q "httpchallenge" <<<"$traefikCmdline"; then
    traefikHttpChallenge="OK"
  fi
  if grep -q "tlschallenge" <<<"$traefikCmdline"; then
    traefikTlsChallenge="OK"
  fi
  checkStatus "$traefikHttpChallenge" "Traefik HTTP challenge configured"
  checkStatus "$traefikTlsChallenge" "Traefik TLS-ALPN-01 challenge configured"

  local acmeFile="FAIL"
  if docker exec traefik test -f /certificates/acme.json; then
    acmeFile="OK"
  fi
  checkStatus "$acmeFile" "Traefik acme.json exists"

  # The jq path assumes the certificate resolver is stored under the "le" key.
  local certCount
  certCount=$(docker exec traefik cat /certificates/acme.json 2>/dev/null \
    | jq -r '.le.Certificates | length' 2>/dev/null)
  # jq prints nothing on empty input; anything that is not a plain number is
  # treated as zero so the numeric test below cannot error out.
  case "$certCount" in
    '' | *[!0-9]*) certCount=0 ;;
  esac
  if [ "$certCount" -gt 0 ]; then
    checkStatus "OK" "Traefik has ${certCount} certificate(s) stored"
  else
    checkStatus "FAIL" "Traefik has no certificates"
  fi

  local mailcowCertFile="/var/deploy/mailcow/data/assets/ssl/cert.pem"
  local mailcowKeyFile="/var/deploy/mailcow/data/assets/ssl/key.pem"
  local mailcowCert="FAIL"
  local mailcowKey="FAIL"
  if [ -f "$mailcowCertFile" ]; then
    mailcowCert="OK"
  fi
  if [ -f "$mailcowKeyFile" ]; then
    mailcowKey="OK"
  fi
  checkStatus "$mailcowCert" "Mailcow certificate file exists"
  checkStatus "$mailcowKey" "Mailcow private key exists"

  if [ -f "$mailcowCertFile" ]; then
    local certExpiry
    local certExpiryEpoch=""
    local currentEpoch
    local daysUntilExpiry
    certExpiry=$(openssl x509 -in "$mailcowCertFile" -noout -enddate 2>/dev/null | cut -d= -f2)
    # An empty date would be parsed by `date -d ""` as today and then be
    # reported as an expired certificate, so only convert a real value.
    if [ -n "$certExpiry" ]; then
      certExpiryEpoch=$(date -d "$certExpiry" +%s 2>/dev/null)
    fi
    if [ -z "$certExpiryEpoch" ]; then
      warnStatus "Could not determine Mailcow certificate expiry date"
    else
      currentEpoch=$(date +%s)
      daysUntilExpiry=$(((certExpiryEpoch - currentEpoch) / 86400))
      if [ "$daysUntilExpiry" -gt 30 ]; then
        checkStatus "OK" "Certificate valid for ${daysUntilExpiry} more days (expires: ${certExpiry})"
      elif [ "$daysUntilExpiry" -gt 0 ]; then
        warnStatus "Certificate expires in ${daysUntilExpiry} days (expires: ${certExpiry})"
      else
        checkStatus "FAIL" "Certificate expired on ${certExpiry}"
      fi
    fi
  fi

  if [ -z "$mailcowHostname" ]; then
    warnStatus "MAILCOW_HOSTNAME not found, skipping HTTPS checks"
    return
  fi

  local httpsResponse="FAIL"
  local certChain="FAIL"
  # --max-time keeps the report from hanging on an unreachable host.
  if curl -sI --max-time 10 "https://${mailcowHostname}" 2>&1 | head -1 | grep -q "HTTP"; then
    httpsResponse="OK"
  fi
  if echo | openssl s_client -connect "${mailcowHostname}:443" -servername "${mailcowHostname}" 2>/dev/null \
    | openssl x509 -noout -issuer 2>/dev/null \
    | grep -q "Let's Encrypt"; then
    certChain="OK"
  fi
  checkStatus "$httpsResponse" "HTTPS connectivity to ${mailcowHostname}"
  checkStatus "$certChain" "Certificate issued by Let's Encrypt"
}
# Section 3 of the report: presence of mailcow.conf, the SKIP_LETS_ENCRYPT,
# USE_WATCHDOG and HTTP_REDIRECT settings in it, and the state of the
# certdumper compose service.
# Returns:   status of the certdumper check
checkMailcowConfig() {
  printSection "3. Mailcow Configuration"

  local confFile="/var/deploy/mailcow/mailcow.conf"
  if [[ ! -f "$confFile" ]]; then
    checkStatus "FAIL" "mailcow.conf not found"
  else
    checkStatus "OK" "mailcow.conf exists"
    local skipLe useWatchdog httpRedirect

    # Each value is the second "="-separated field of the matching line.
    skipLe=$(grep "^SKIP_LETS_ENCRYPT=" "$confFile" | cut -d= -f2)
    case "$skipLe" in
      n) checkStatus "OK" "Let's Encrypt enabled in mailcow" ;;
      *) warnStatus "Let's Encrypt disabled in mailcow (expected with Traefik)" ;;
    esac

    useWatchdog=$(grep "^USE_WATCHDOG=" "$confFile" | cut -d= -f2)
    case "$useWatchdog" in
      y) checkStatus "OK" "Watchdog enabled" ;;
      *) warnStatus "Watchdog disabled" ;;
    esac

    httpRedirect=$(grep "^HTTP_REDIRECT=" "$confFile" | cut -d= -f2)
    case "$httpRedirect" in
      y) checkStatus "OK" "HTTP to HTTPS redirect enabled" ;;
      *) warnStatus "HTTP to HTTPS redirect disabled" ;;
    esac
  fi

  # First state reported by compose for the "certdumper" service.
  local certdumperStatus
  certdumperStatus=$(cd /var/deploy/mailcow && docker compose ps --format json 2>/dev/null | jq -r 'select(.Service=="certdumper") | .State' | head -1)
  if [[ "$certdumperStatus" != "running" ]]; then
    checkStatus "FAIL" "Certdumper service not running"
  else
    checkStatus "OK" "Certdumper service running (syncs Traefik certs to mailcow)"
  fi
}
# Section 4 of the report: count "HTTP validation failed" lines in the last 50
# log lines of the acme-mailcow compose service, unless mailcow.conf sets
# SKIP_LETS_ENCRYPT=y.
# Returns:   0 when ACME is disabled or the log was inspected, 1 when the log
#            could not be read
checkAcmeLogs() {
  printSection "4. SSL Challenge Status"

  local mailcowConfFile="/var/deploy/mailcow/mailcow.conf"
  local skipLe=""
  # Only read the setting when the file exists, so a missing file does not
  # produce a grep error in the report.
  if [ -f "$mailcowConfFile" ]; then
    skipLe=$(grep "^SKIP_LETS_ENCRYPT=" "$mailcowConfFile" | cut -d= -f2)
  fi
  if [ "$skipLe" = "y" ]; then
    checkStatus "OK" "Mailcow ACME disabled (SKIP_LETS_ENCRYPT=y)"
    return 0
  fi

  # grep -c always prints a count once it runs, so an empty result means the
  # compose directory could not be entered. Without this distinction an
  # unreadable log would be reported as "no failures".
  local acmeErrors
  acmeErrors=$(cd /var/deploy/mailcow 2>/dev/null \
    && docker compose logs acme-mailcow --tail 50 2>&1 \
    | grep -ci "HTTP validation failed")
  if [ -z "$acmeErrors" ]; then
    warnStatus "Could not read mailcow ACME logs, skipping HTTP validation check"
    return 1
  fi

  if [ "$acmeErrors" -gt 0 ]; then
    warnStatus "Found ${acmeErrors} HTTP validation failures in mailcow ACME logs"
    warnStatus "Expected if autodiscover/autoconfig subdomains are handled by Traefik"
  else
    checkStatus "OK" "No HTTP validation failures in mailcow ACME logs"
  fi
}
# Section 5 of the report: DNS-based mail security checks for the Mailcow
# hostname and its parent domain (DNSSEC, TLSA, MTA-STS, TLS-RPT, DKIM, SPF,
# DMARC and reverse DNS), all queried with dig/curl.
# Globals:   mailcowHostname (read), yellow, noColor (read)
# Returns:   1 when mailcowHostname is empty (all checks skipped)
checkMailSecurity() {
  printSection "5. Mail Security (DANE/DNSSEC/MTA-STS)"
  if [[ -z "$mailcowHostname" ]]; then
    warnStatus "MAILCOW_HOSTNAME not found, skipping mail security checks"
    return 1
  fi

  # Everything after the first dot-separated label of the hostname.
  local domainPart
  domainPart=$(cut -d. -f2- <<<"$mailcowHostname")

  # DNSSEC: both DS and DNSKEY answers must be present.
  echo -e "${yellow}DNSSEC Status:${noColor}"
  local dsRecords dnskeyRecords
  dsRecords=$(dig +short DS "$domainPart" 2>/dev/null | wc -l)
  dnskeyRecords=$(dig +short DNSKEY "$domainPart" 2>/dev/null | wc -l)
  if [[ "$dsRecords" -gt 0 && "$dnskeyRecords" -gt 0 ]]; then
    checkStatus "OK" "DNSSEC enabled (${dsRecords} DS, ${dnskeyRecords} DNSKEY records)"
  else
    warnStatus "DNSSEC not fully active (DS: ${dsRecords}, DNSKEY: ${dnskeyRecords})"
  fi

  # TLSA/DANE: one lookup per mail port; entries are "<port>:<label>".
  echo -e "${yellow}TLSA/DANE Records:${noColor}"
  local tlsaSpec tlsaPort tlsaLabel tlsaCount
  for tlsaSpec in "25:SMTP" "465:SMTPS" "587:Submission"; do
    tlsaPort=${tlsaSpec%%:*}
    tlsaLabel=${tlsaSpec#*:}
    tlsaCount=$(dig +short TLSA "_${tlsaPort}._tcp.${mailcowHostname}" 2>/dev/null | wc -l)
    if [[ "$tlsaCount" -gt 0 ]]; then
      checkStatus "OK" "TLSA record for port ${tlsaPort} (${tlsaLabel})"
    else
      warnStatus "TLSA record missing for port ${tlsaPort}"
    fi
  done

  # MTA-STS: DNS TXT record plus the policy file served over HTTPS.
  echo -e "${yellow}MTA-STS:${noColor}"
  local mtaStsDns="FAIL"
  local mtaStsPolicy="FAIL"
  if dig +short TXT "_mta-sts.${domainPart}" 2>/dev/null | grep -q "STSv1"; then
    mtaStsDns="OK"
  fi
  if curl -sk "https://mta-sts.${domainPart}/.well-known/mta-sts.txt" 2>/dev/null | grep -q "version: STSv1"; then
    mtaStsPolicy="OK"
  fi
  checkStatus "$mtaStsDns" "MTA-STS DNS record"
  checkStatus "$mtaStsPolicy" "MTA-STS policy file accessible"

  # TLS-RPT.
  echo -e "${yellow}TLS-RPT:${noColor}"
  local tlsRpt="FAIL"
  if dig +short TXT "_smtp._tls.${domainPart}" 2>/dev/null | grep -q "TLSRPTv1"; then
    tlsRpt="OK"
  fi
  checkStatus "$tlsRpt" "TLS-RPT DNS record"

  # DKIM: only the "default" selector is queried.
  echo -e "${yellow}DKIM:${noColor}"
  local dkimRecord="FAIL"
  if dig +short TXT "default._domainkey.${domainPart}" 2>/dev/null | grep -q "DKIM1"; then
    dkimRecord="OK"
  fi
  checkStatus "$dkimRecord" "DKIM DNS record"

  # SPF.
  echo -e "${yellow}SPF:${noColor}"
  local spfRecord="FAIL"
  if dig +short TXT "$domainPart" 2>/dev/null | grep -q "spf1"; then
    spfRecord="OK"
  fi
  checkStatus "$spfRecord" "SPF DNS record"

  # DMARC.
  echo -e "${yellow}DMARC:${noColor}"
  local dmarcRecord="FAIL"
  if dig +short TXT "_dmarc.${domainPart}" 2>/dev/null | grep -q "DMARC1"; then
    dmarcRecord="OK"
  fi
  checkStatus "$dmarcRecord" "DMARC DNS record"

  # Reverse DNS: resolve the first A/AAAA address and compare its PTR answer
  # with the hostname; an address family without a record is skipped.
  echo -e "${yellow}Reverse DNS (PTR):${noColor}"
  local familySpec familyName recordType address ptrName
  for familySpec in "IPv4:A" "IPv6:AAAA"; do
    familyName=${familySpec%%:*}
    recordType=${familySpec#*:}
    address=$(dig +short "$recordType" "$mailcowHostname" 2>/dev/null | head -1)
    [[ -n "$address" ]] || continue
    ptrName=$(dig +short -x "$address" 2>/dev/null)
    if echo "$ptrName" | grep -q "$mailcowHostname"; then
      checkStatus "OK" "${familyName} PTR record points to ${mailcowHostname}"
    else
      warnStatus "${familyName} PTR record: ${ptrName:-not found}"
    fi
  done
}
# Print the closing section of the report.
# The function has no access to the results of the individual checks, so it
# points the reader at the sections above instead of asserting that services,
# certificates and configuration are healthy.
# Globals:   yellow, blue, noColor (read)
printSummary() {
  printSection "Summary"
  echo -e "${yellow}Review the sections above: any failed or warning items need attention.${noColor}"
  echo ""
  echo -e "${blue}Health check completed.${noColor}"
}
# Entry point: print the report banner, verify that Docker and the Compose
# plugin are available (exiting with status 1 otherwise), show basic host
# resources, load the domain/hostname configuration and run every diagnostic
# section in order.
main() {
  local rule="========================================"
  echo -e "${blue}${rule}${noColor}"
  echo -e "${blue} Comprehensive Diagnostics Report${noColor}"
  echo -e "${blue}${rule}${noColor}"

  echo -e "${yellow}Checking Docker and Docker Compose installation...${noColor}"
  if ! command -v docker >/dev/null 2>&1; then
    echo -e "${red}Docker is not installed.${noColor}"
    exit 1
  fi
  echo -e "${green}Docker is installed: $(docker --version)${noColor}"

  if ! docker compose version >/dev/null 2>&1; then
    echo -e "${red}Docker Compose plugin is not installed.${noColor}"
    exit 1
  fi
  echo -e "${green}Docker Compose plugin is installed: $(docker compose version)${noColor}"

  # jq is optional here: its absence is only announced.
  command -v jq >/dev/null 2>&1 \
    || warnStatus "jq not found, some checks will be skipped"

  echo -e "${yellow}Checking system resources...${noColor}"
  echo -e "${yellow}CPU:${noColor} $(grep -c processor /proc/cpuinfo) cores"
  echo -e "${yellow}Memory:${noColor} $(free -h | grep Mem | awk '{print $2}')"
  echo -e "${yellow}Disk space:${noColor} $(df -h / | awk 'NR==2 {print $2}')"

  # Populate DOMAIN and mailcowHostname for the checks below.
  loadCoreDomain
  loadMailcowHostname

  printSection "Core Service Diagnostics"
  local step
  # Core checks, each followed by a blank separator line.
  for step in checkTraefik checkForgejo checkDatabases checkNextcloud checkAllServices; do
    "$step"
    echo ""
  done

  # Mailcow-related sections print their own section banners.
  for step in checkMailcowServices checkSslAndCerts checkMailcowConfig checkAcmeLogs checkMailSecurity; do
    "$step"
  done

  printSummary
}

main "$@"