feat(deploy): manifest-driven config, teardown + db-perms, hardening

deploy.sh:
- never rsync into /; stage to /tmp on the remote and install at final
  paths via sudo bash heredoc, closing the parent-dir attribute leak
  that broke three hosts in the earlier rsync incident
- shell-quote heredoc args via ${var@Q}
- drop -A -X on the remaining (web) rsyncs
- generic worker.secrets loop reads (env-var → pass path) from manifest;
  GITEA_TOKEN now flows through automatically
- in-memory bash substitution for templates (secrets never on argv)
- simplify semanage port labelling: --add 2>/dev/null || --modify (the
  old grep pre-check matched only the first listed port)
- restorecon back to short flags (Fedora policycoreutils has no long
  forms; --recursive errored at deploy time)
- quieter health probe loop: curl diagnostics only on final failure

manifest as source of truth:
- api.config.bind drives BIND_ADDR, firewalld port, semanage label,
  health-probe URL
- web.config.{server_name,root,api_upstream} drives nginx render,
  rsync targets, restorecon scope
- nginx config renamed to site.conf.tmpl; firewalld svc to
  moments-api.xml.tmpl; both rendered at deploy time
- topology flip: api → nikola, worker → frootmig (anjie freed)

new scripts:
- script/teardown.sh: idempotent component teardown, never rsyncs,
  shared-state cleanup gated on absence of remaining env files,
  --remove-docroot guard against shallow / system paths
- script/db-perms.sh: rewritten — fixes grep/append role mismatch that
  appended duplicates on re-run, adds postgres reload, hits primary +
  standby in a single invocation

readme: genericized; deployment topology no longer carries real host
or site names.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-04 16:39:10 +03:00
parent f30f949895
commit 8867ff5df3
9 changed files with 643 additions and 181 deletions

288
script/teardown.sh Executable file
View File

@@ -0,0 +1,288 @@
#!/usr/bin/env bash
#
# moments teardown script.
#
# Synopsis:
#   ./script/teardown.sh <environment> <host> [component...] [--dry-run] [--remove-docroot]
#
# Examples:
#   ./script/teardown.sh prod anjie.kosherinata.internal api worker
#   ./script/teardown.sh prod oolon.kosherinata.internal web --remove-docroot
#   ./script/teardown.sh prod anjie.kosherinata.internal all --dry-run
#
# Components: api | worker | web | all
#
# Removes moments unit files, binaries, env files, the firewalld service and
# its definition, and the SELinux port label. Once no moments component env
# file remains on the host, the shared /etc/moments and /var/lib/moments
# directories and the sysusers entry are removed as well. Idempotent — safe
# to re-run.
#
# Notes:
#   - The host is an explicit argument on purpose: teardown usually targets
#     hosts that were already dropped from manifest.components.<c>.hosts.
#   - The manifest is still consulted for env-wide config (api port,
#     server_name, docroot path), so <environment> must still resolve.
#   - The `moments` user/group is intentionally NOT removed: any leftover
#     file owned by it would become orphan-owned. Run `userdel moments`
#     manually once you are certain none remain.
#   - The web docroot is kept unless --remove-docroot is passed.

set -o errexit -o nounset -o pipefail
shopt -s nullglob

# Repository root is the parent of the directory that holds this script; the
# deployment manifest lives at a fixed path below it.
repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
manifest="${repo_root}/asset/manifest.yml"

# Option defaults; set to 1 by --remove-docroot / --dry-run on the command line.
remove_docroot=0
dry_run=0
# Print the command synopsis and examples to stderr, then exit with status 2
# (the conventional "bad invocation" code). Never returns.
usage() {
  local prog
  # Tolerate a failing basename the same way a command substitution inside a
  # heredoc would: the name is simply left empty.
  prog="$(basename "$0")" || true
  {
    printf 'usage: %s <environment> <host> [component...] [--dry-run] [--remove-docroot]\n' "$prog"
    printf '%s prod anjie.kosherinata.internal api worker\n' "$prog"
    printf '%s prod oolon.kosherinata.internal web --remove-docroot\n' "$prog"
    printf '%s prod anjie.kosherinata.internal all\n' "$prog"
    printf 'components: api | worker | web | all\n'
  } >&2
  exit 2
}
# Shared formatter for the three message helpers below.
#   $1   - SGR colour parameters (e.g. "1;34")
#   $2.. - message words, joined with a single space
# Writes "[teardown] <message>" to stderr with the tag in the given colour.
_teardown_say() {
  local colour="$1"
  shift
  printf '\033[%sm[teardown]\033[0m %s\n' "$colour" "$*" >&2
}

# Informational message (bold blue tag) on stderr.
log() {
  _teardown_say '1;34' "$@"
}

# Warning message (bold yellow tag) on stderr; execution continues.
warn() {
  _teardown_say '1;33' "$@"
}

# Fatal message (bold red tag) on stderr, then exit the script with status 1.
die() {
  _teardown_say '1;31' "$@"
  exit 1
}
# Run a command on a remote host over ssh, or just describe it in dry-run mode.
#   $1   - remote host
#   $2.. - command words handed to ssh
# Globals: dry_run (read).
# BatchMode=yes makes ssh fail instead of prompting for a password/passphrase.
# Returns ssh's exit status; in dry-run mode nothing is executed and a dimmed
# "[dry-run]" line is written to stderr instead.
ssh_run() {
  local remote="$1"
  shift

  if (( ! dry_run )); then
    ssh -o BatchMode=yes "$remote" "$@"
    return
  fi

  printf '\033[2m[dry-run]\033[0m ssh %s -- %s\n' "$remote" "$*" >&2
}
# Argument parsing ----------------------------------------------------------
# Positional: <environment> <host>, followed by component names and flags in
# any order.
[[ $# -ge 2 ]] || usage
environment="$1"; shift
target_host="$1"; shift

components=()
while [[ $# -gt 0 ]]; do
  case "$1" in
    --dry-run) dry_run=1 ;;
    --remove-docroot) remove_docroot=1 ;;
    -h | --help) usage ;;
    # Anything else that looks like an option is a typo. Refuse it here:
    # treating e.g. "--dryrun" as a component name would let the teardown
    # run for real when the operator asked for a rehearsal.
    -*) die "unknown option: $1" ;;
    api | worker | web | all) components+=("$1") ;;
    # Reject unknown components before touching the host, so a typo cannot
    # result in a partial teardown.
    *) die "unknown component: $1 (expected: api | worker | web | all)" ;;
  esac
  shift
done

[[ -f "$manifest" ]] || die "manifest not found: $manifest"
command -v yq >/dev/null 2>&1 || die "yq is required"

# yq path of the selected environment; reused by the teardown_* functions.
env_path=".environments.${environment}"
yq --exit-status "${env_path}" "$manifest" >/dev/null \
  || die "environment '$environment' not found in manifest"

[[ ${#components[@]} -gt 0 ]] || usage

# "all" anywhere in the list selects every component. Names were validated
# above, so joining them with spaces is unambiguous.
case " ${components[*]} " in
  *" all "*) components=(api worker web) ;;
esac
# Tear down the api component on one host.
#   $1 - target host
# Globals: env_path, manifest, dry_run (all read).
# Reads <env>.components.api.config.bind from the manifest and takes the
# numeric part after the last ':' as the api port. The remote script then
# stops/disables the moments-api units, removes the unit files, env file and
# binary, removes the firewalld service + definition, and (when the port is
# known) deletes the SELinux port label.
# Returns the exit status of the remote script (0 in dry-run mode).
teardown_api() {
  local host="$1"
  log "api -> $host"

  local bind api_port=""
  bind="$(yq --raw-output "${env_path}.components.api.config.bind" "$manifest")"
  if [[ -n "$bind" && "$bind" != "null" && "$bind" == *:* ]]; then
    # Everything after the last ':' — also handles "[::1]:8443".
    api_port="${bind##*:}"
    [[ "$api_port" =~ ^[0-9]+$ ]] || api_port=""
  fi
  if [[ -z "$api_port" ]]; then
    # Without a port the SELinux step is skipped; say so instead of leaving
    # the label behind silently.
    warn "api.config.bind ('${bind}') yields no numeric port; SELinux port label will NOT be removed on $host"
  fi

  if (( dry_run )); then
    printf '\033[2m[dry-run]\033[0m stop+disable moments-api units, remove unit files, /etc/moments/api.env, /usr/local/bin/moments-api, firewalld svc moments-api, SELinux label tcp/%s on %s\n' \
      "${api_port:-<unknown>}" "$host" >&2
    return 0
  fi

  # ${api_port@Q} shell-quotes the value for the remote command line; an
  # empty port arrives as '' so "$1" is always set on the remote side.
  ssh_run "$host" "sudo bash -s -- ${api_port@Q}" <<'REMOTE_EOF'
set -euo pipefail
api_port="$1"

# Stop + disable units. `disable --now` quietly does nothing on a unit that
# isn't loaded, but emits non-zero exit on some systemd versions when the
# file is already gone — swallow that so re-runs are clean.
for unit in moments-api.service moments-api-cert.path moments-api-cert-reload.service; do
  systemctl disable --now "$unit" 2>/dev/null || true
done
rm --force -- \
  /etc/systemd/system/moments-api.service \
  /etc/systemd/system/moments-api-cert.path \
  /etc/systemd/system/moments-api-cert-reload.service
systemctl daemon-reload
rm --force -- /etc/moments/api.env /usr/local/bin/moments-api

# Firewalld: remove the service from the default zone, then drop the service
# definition. Permanent and runtime configuration are queried independently
# because they can diverge; checking only the runtime side could leave a
# permanent zone entry pointing at a definition file that is deleted below.
zone="$(firewall-cmd --get-default-zone)"
if firewall-cmd --permanent --zone="$zone" --query-service=moments-api >/dev/null 2>&1; then
  firewall-cmd --permanent --zone="$zone" --remove-service=moments-api
fi
if firewall-cmd --zone="$zone" --query-service=moments-api >/dev/null 2>&1; then
  # Best effort: the reload below re-syncs runtime from permanent anyway.
  firewall-cmd --zone="$zone" --remove-service=moments-api 2>/dev/null || true
fi
rm --force -- /etc/firewalld/services/moments-api.xml
firewall-cmd --reload

# SELinux: remove the port label, if we know which port. --delete fails when
# the port wasn't user-labelled — that's fine, swallow it.
if [[ -n "$api_port" ]]; then
  semanage port --delete --proto=tcp "$api_port" 2>/dev/null || true
fi
echo "moments-api torn down"
REMOTE_EOF
}
# Tear down the worker component on one host.
#   $1 - target host
# Globals: dry_run (read).
# The remote script stops/disables the moments-worker units and removes the
# unit files, /etc/moments/worker.env and the worker binary.
# Returns the exit status of the remote script (0 in dry-run mode).
teardown_worker() {
  local target="$1"
  log "worker -> $target"

  if (( ! dry_run )); then
    ssh_run "$target" "sudo bash -s" <<'REMOTE_EOF'
set -euo pipefail
for unit in moments-worker.service moments-worker-cert.path moments-worker-cert-reload.service; do
systemctl disable --now "$unit" 2>/dev/null || true
done
rm --force \
/etc/systemd/system/moments-worker.service \
/etc/systemd/system/moments-worker-cert.path \
/etc/systemd/system/moments-worker-cert-reload.service
systemctl daemon-reload
rm --force /etc/moments/worker.env /usr/local/bin/moments-worker
echo "moments-worker torn down"
REMOTE_EOF
    return
  fi

  # Dry run: describe what would happen, touch nothing.
  printf '\033[2m[dry-run]\033[0m stop+disable moments-worker units, remove unit files, /etc/moments/worker.env, /usr/local/bin/moments-worker on %s\n' "$target" >&2
  return 0
}
# Tear down the web component on one host.
#   $1 - target host
# Globals: env_path, manifest, dry_run, remove_docroot (all read).
# Reads web.config.server_name and web.config.root from the manifest. The
# remote script removes /etc/nginx/conf.d/<server_name>.conf, validates and
# reloads nginx, and — only with --remove-docroot — recursively removes the
# docroot.
# Dies (exit 1) on missing/invalid manifest values or an unsafe docroot.
# Returns the exit status of the remote script (0 in dry-run mode).
teardown_web() {
  local host="$1"
  log "web -> $host"

  local server_name web_root
  server_name="$(yq --raw-output "${env_path}.components.web.config.server_name" "$manifest")"
  web_root="$(yq --raw-output "${env_path}.components.web.config.root" "$manifest")"
  [[ -n "$server_name" && "$server_name" != "null" ]] || die "web.config.server_name missing in manifest"
  [[ -n "$web_root" && "$web_root" != "null" ]] || die "web.config.root missing in manifest"
  [[ "$web_root" == /* ]] || die "web.config.root must be an absolute path: '$web_root'"
  # server_name becomes a file name that is removed as root; a '/' in it
  # would let the path escape /etc/nginx/conf.d.
  [[ "$server_name" != */* ]] || die "web.config.server_name must not contain '/': '$server_name'"

  # Refuse to recursively remove a shallow or system path even if the
  # manifest says so.
  if (( remove_docroot )); then
    # '.' and '..' components defeat the textual checks below: /var/./lib
    # has three components and /var/www/x/../../lib matches no deny-list
    # entry, yet both resolve to /var/lib. Reject them outright.
    case "${web_root}/" in
      */./* | */../*)
        die "refusing to recursively remove a path with '.' or '..' components: '$web_root'"
        ;;
    esac
    case "$web_root" in
      /|/bin|/bin/*|/boot|/boot/*|/dev|/dev/*|/etc|/etc/*|/home|/home/*|/lib|/lib/*|/lib64|/lib64/*|/proc|/proc/*|/root|/root/*|/run|/run/*|/sbin|/sbin/*|/srv|/srv/*|/sys|/sys/*|/tmp|/tmp/*|/usr|/usr/*|/var|/var/lib|/var/log|/var/run|/var/spool|/var/www)
        die "refusing to recursively remove a system path: '$web_root'"
        ;;
    esac
    # Require at least three path components (e.g. /var/www/<site>) to
    # rule out things like /opt or /srv directly.
    [[ "$web_root" =~ ^/[^/]+/[^/]+/[^/]+ ]] \
      || die "refusing to recursively remove a path with fewer than 3 components: '$web_root'"
  fi

  local site_conf_path="/etc/nginx/conf.d/${server_name}.conf"

  if (( dry_run )); then
    if (( remove_docroot )); then
      printf '\033[2m[dry-run]\033[0m remove %s, recursively remove %s, nginx -t/reload on %s\n' \
        "$site_conf_path" "$web_root" "$host" >&2
    else
      printf '\033[2m[dry-run]\033[0m remove %s, nginx -t/reload on %s (docroot %s left intact; pass --remove-docroot to also clear it)\n' \
        "$site_conf_path" "$host" "$web_root" >&2
    fi
    return 0
  fi

  # ${var@Q} shell-quotes each value for the remote command line.
  ssh_run "$host" "sudo bash -s -- ${site_conf_path@Q} ${web_root@Q} ${remove_docroot@Q}" <<'REMOTE_EOF'
set -euo pipefail
site_conf_path="$1"
web_root="$2"
remove_docroot="$3"

# "--" keeps a value that starts with '-' from being parsed as an rm option.
rm --force -- "$site_conf_path"

# Only reload when the remaining configuration is valid; a failed test means
# some other site config is broken and needs a human.
if nginx -t 2>&1; then
  systemctl reload nginx
  echo "nginx reloaded without ${site_conf_path}"
else
  echo "nginx -t failed AFTER removing ${site_conf_path}; check other site configs" >&2
  exit 1
fi

if [[ "$remove_docroot" == "1" && -d "$web_root" ]]; then
  # ":?" aborts instead of expanding to an empty path.
  rm --recursive --force -- "${web_root:?}"
  echo "removed docroot ${web_root}"
fi
REMOTE_EOF
}
# Post-component cleanup of state shared by the api and worker components.
#   $1 - target host
# Globals: dry_run (read).
# The remote script does nothing while /etc/moments/api.env or
# /etc/moments/worker.env still exists; otherwise it rmdir's /etc/moments and
# /var/lib/moments (only if empty) and removes the sysusers entry. The
# moments user/group is left in place.
# Returns the exit status of the remote script (0 in dry-run mode).
teardown_shared() {
  local target="$1"
  log "shared (post-component cleanup) -> $target"

  if (( ! dry_run )); then
    ssh_run "$target" "sudo bash -s" <<'REMOTE_EOF'
set -euo pipefail
# If any component env still exists, leave shared state alone — another
# moments component is still using /etc/moments and the moments user.
if [[ -e /etc/moments/api.env || -e /etc/moments/worker.env ]]; then
echo "moments env files still present; leaving /etc/moments + /var/lib/moments + sysusers entry in place"
exit 0
fi
# rmdir refuses non-empty dirs — defensive against unknown stragglers.
rmdir /etc/moments 2>/dev/null || true
rmdir /var/lib/moments 2>/dev/null || true
rm --force /etc/sysusers.d/moments.conf
echo "shared state cleared (where empty); moments user/group intentionally left in place"
REMOTE_EOF
    return
  fi

  # Dry run: describe what would happen, touch nothing.
  printf '\033[2m[dry-run]\033[0m if no api.env/worker.env remain: remove /etc/sysusers.d/moments.conf and rmdir /etc/moments + /var/lib/moments on %s (moments user left in place)\n' "$target" >&2
  return 0
}
# Dispatch ------------------------------------------------------------------
# Run every requested component teardown against the target host. A failing
# component does not stop the others; failures are collected and reported
# together at the end.
failed=()
did_app=0
for component in "${components[@]}"; do
  case "$component" in
    api)
      did_app=1
      teardown_api "$target_host" || failed+=("api@$target_host")
      ;;
    worker)
      did_app=1
      teardown_worker "$target_host" || failed+=("worker@$target_host")
      ;;
    web)
      teardown_web "$target_host" || failed+=("web@$target_host")
      ;;
    *)
      # A component that was asked for but not torn down is a failure, not
      # a footnote: record it so the run cannot end with "teardown complete"
      # and exit status 0.
      warn "unknown component: $component"
      failed+=("unknown:$component")
      ;;
  esac
done

# Shared cleanup runs after api/worker teardown. It's a no-op if either
# component still has its env file present on the host.
if (( did_app )); then
  teardown_shared "$target_host" || failed+=("shared@$target_host")
fi

if [[ ${#failed[@]} -gt 0 ]]; then
  die "failed: ${failed[*]}"
fi
log "teardown complete on $target_host"