> "$LOGFILE" } get_vm_state() { openstack server show "$SERVER_ID" -f json | jq -r '."OS-EXT-STS:vm_state" // empty' } get_power_status() { openstack server show "$SERVER_ID" -f json | jq -r '.status // empty' } wait_for_stopped() { local timeout_sec="${1:-900}" interval_sec="${2:-5}" elapsed=0 log "Waiting for VM to stop (up to ${timeout_sec}s)..." while true; do local vm_state status vm_state=$(get_vm_state) status=$(get_power_status) log " vm_state=${vm_state:-unknown}, status=${status:-unknown}" "> > "$LOGFILE" } get_vm_state() { openstack server show "$SERVER_ID" -f json | jq -r '."OS-EXT-STS:vm_state" // empty' } get_power_status() { openstack server show "$SERVER_ID" -f json | jq -r '.status // empty' } wait_for_stopped() { local timeout_sec="${1:-900}" interval_sec="${2:-5}" elapsed=0 log "Waiting for VM to stop (up to ${timeout_sec}s)..." while true; do local vm_state status vm_state=$(get_vm_state) status=$(get_power_status) log " vm_state=${vm_state:-unknown}, status=${status:-unknown}" "> > "$LOGFILE" } get_vm_state() { openstack server show "$SERVER_ID" -f json | jq -r '."OS-EXT-STS:vm_state" // empty' } get_power_status() { openstack server show "$SERVER_ID" -f json | jq -r '.status // empty' } wait_for_stopped() { local timeout_sec="${1:-900}" interval_sec="${2:-5}" elapsed=0 log "Waiting for VM to stop (up to ${timeout_sec}s)..." while true; do local vm_state status vm_state=$(get_vm_state) status=$(get_power_status) log " vm_state=${vm_state:-unknown}, status=${status:-unknown}" ">
#!/usr/bin/env bash
set -euo pipefail
# ============================================================
# OpenStack VM Root Filesystem Repairer (Ceph RBD backend)
# Logs to: /var/log/support-fix-vm-disk.log
# ============================================================
LOGFILE="/var/log/support-fix-vm-disk.log"
# -------------------------
# Helpers
# -------------------------
#######################################
# Print a message to stdout and append a timestamped copy to the log file.
# Globals:   LOGFILE (read) - file the timestamped line is appended to
# Arguments: $1 - message text
# Outputs:   the message on stdout; "YYYY-MM-DD HH:MM:SS <message>" in LOGFILE
#######################################
log() {
  local msg="$1"
  # printf rather than echo: echo would treat a message such as "-n" or "-e"
  # as an option and print nothing useful.
  printf '%s\n' "$msg"
  printf '%s %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$msg" >> "$LOGFILE"
}
#######################################
# Print the "OS-EXT-STS:vm_state" field of the selected server.
# Globals:   SERVER_ID (read)
# Outputs:   the field value on stdout; nothing when the field is null/absent
#######################################
get_vm_state() {
  local jq_filter='."OS-EXT-STS:vm_state" // empty'
  openstack server show "$SERVER_ID" -f json \
    | jq -r "$jq_filter"
}
#######################################
# Print the "status" field of the selected server (compared elsewhere in this
# script against values such as "ACTIVE" and "SHUTOFF").
# Globals:   SERVER_ID (read)
# Outputs:   the field value on stdout; nothing when the field is null/absent
#######################################
get_power_status() {
  local jq_filter='.status // empty'
  openstack server show "$SERVER_ID" -f json \
    | jq -r "$jq_filter"
}
#######################################
# Poll the server until it reports a stopped state or the timeout elapses.
# Arguments: $1 - timeout in seconds (default 900)
#            $2 - polling interval in seconds (default 5)
# Outputs:   one progress line per poll via log
# Returns:   0 once vm_state is "stopped" or status is "SHUTOFF";
#            1 when the timeout is reached first
#######################################
wait_for_stopped() {
  local timeout_sec="${1:-900}" interval_sec="${2:-5}" elapsed=0
  local vm_state status
  log "Waiting for VM to stop (up to ${timeout_sec}s)..."
  while true; do
    # A failed query must not kill the script under `set -e`; report the
    # value as unknown and keep polling until the timeout decides.
    vm_state=$(get_vm_state) || vm_state=""
    status=$(get_power_status) || status=""
    log " vm_state=${vm_state:-unknown}, status=${status:-unknown}"
    if [[ "$vm_state" == "stopped" || "$status" == "SHUTOFF" ]]; then
      log "VM is stopped."
      return 0
    fi
    if (( elapsed >= timeout_sec )); then
      log "Error: Timed out waiting for VM to stop."
      return 1
    fi
    sleep "$interval_sec"
    # Plain assignment: unlike (( elapsed += n )) it never yields a non-zero
    # status, so it is safe under `set -e` even for a zero interval.
    elapsed=$(( elapsed + interval_sec ))
  done
}
#######################################
# Poll the server until it reports an active state or the timeout elapses.
# Arguments: $1 - timeout in seconds (default 900)
#            $2 - polling interval in seconds (default 5)
# Outputs:   one progress line per poll via log
# Returns:   0 once vm_state is "active" or status is "ACTIVE";
#            1 when the timeout is reached first
#######################################
wait_for_active() {
  local timeout_sec="${1:-900}" interval_sec="${2:-5}" elapsed=0
  local vm_state status
  log "Waiting for VM to become ACTIVE (up to ${timeout_sec}s)..."
  while true; do
    # A failed query must not kill the script under `set -e`; report the
    # value as unknown and keep polling until the timeout decides.
    vm_state=$(get_vm_state) || vm_state=""
    status=$(get_power_status) || status=""
    log " vm_state=${vm_state:-unknown}, status=${status:-unknown}"
    if [[ "$vm_state" == "active" || "$status" == "ACTIVE" ]]; then
      log "VM is ACTIVE."
      return 0
    fi
    if (( elapsed >= timeout_sec )); then
      log "Error: Timed out waiting for VM to become ACTIVE."
      return 1
    fi
    sleep "$interval_sec"
    # Plain assignment: unlike (( elapsed += n )) it never yields a non-zero
    # status, so it is safe under `set -e` even for a zero interval.
    elapsed=$(( elapsed + interval_sec ))
  done
}
#######################################
# Map an RBD image on this host and record the resulting device in DEV_ID.
# If `rbd map` fails, fall back to the first matching row of `rbd showmapped`.
# Globals:   DEV_ID (written)
# Arguments: $1 - pool name
#            $2 - image name
# Outputs:   progress lines via log
# Returns:   0 with DEV_ID set; exits the script with status 1 when the image
#            can neither be mapped nor found among the existing mappings
#######################################
map_rbd_image() {
  local rbd_pool="$1" rbd_image="$2"
  log "Mapping RBD image: pool='${rbd_pool}', image='${rbd_image}' ..."
  if ! DEV_ID=$(rbd map --pool "$rbd_pool" "$rbd_image" 2>/dev/null); then
    log "rbd map failed; trying to find existing mapping..."
    # Relies on the plain `rbd showmapped` layout: pool in column 2, image in
    # column 3, device in column 5.
    DEV_ID=$(rbd showmapped \
      | awk -v p="$rbd_pool" -v img="$rbd_image" '$2==p && $3==img {print $5}' \
      | head -n1 || true)
    if [[ -z "$DEV_ID" ]]; then
      log "Error: Could not map or find mapped device for ${rbd_pool}/${rbd_image}."
      exit 1
    fi
  fi
  log "-> Mapped device: $DEV_ID"
}
#######################################
# EXIT-trap handler: unmap the device recorded in DEV_ID if it is still mapped.
# Globals:   DEV_ID (read)
# Outputs:   a log line when an unmap is attempted
# Returns:   0; a failed unmap is deliberately ignored (best effort at exit)
#######################################
cleanup() {
  [[ -n "${DEV_ID:-}" ]] || return 0
  # Compare the device column literally inside a single awk. The previous
  # `grep -qx "$DEV_ID"` treated the path as a regex, and grep -q exiting
  # early can make the upstream stage fail the pipeline under `pipefail`.
  # Assumes the device path is the 5th column of `rbd showmapped`.
  if rbd showmapped \
    | awk -v dev="$DEV_ID" '$5 == dev { found = 1 } END { exit !found }'; then
    log "Cleanup: unmapping $DEV_ID"
    rbd unmap "$DEV_ID" || true
  fi
}
# -------------------------
# Preconditions & setup
# -------------------------
# Refuse to run without root privileges.
if (( EUID != 0 )); then
  echo "This script must be run as root." >&2
  exit 1
fi

# Make sure the log file exists; tightening its permissions is best effort.
mkdir -p "$(dirname "$LOGFILE")"
touch "$LOGFILE"
chmod 600 "$LOGFILE" || true

# Give the shared globals a defined (empty) value before the EXIT trap is armed.
DEV_ID="" SERVER_ID="" cur_vm_state="" cur_status=""
trap cleanup EXIT

# Banner.
printf '%s\n' \
  "===========================================" \
  " OpenStack RBD Root Filesystem Repairer" \
  "==========================================="
# -------------------------
# Step 1: List projects
# -------------------------
log "[1/8] Fetching OpenStack projects..."
projects_json=$(openstack project list -f json)
# One "<name> <id>" line per project. The jq interpolation is \( ... ); the
# previous "\\(" produced a literal backslash instead of the field values.
mapfile -t proj_lines < <(jq -r '.[] | "\(.Name) \(.ID)"' <<<"$projects_json")
if (( ${#proj_lines[@]} == 0 )); then
  log "No projects found."
  exit 1
fi
echo
log "Select a project:"
i=1
for line in "${proj_lines[@]}"; do
  # The ID is the last space-separated word; everything before it is the
  # name, which may itself contain spaces.
  pname=${line% *}
  pid=${line##* }
  printf '%2d) %s (%s)\n' "$i" "$pname" "$pid"
  # Plain assignment instead of ((i++)): never returns non-zero under set -e.
  i=$(( i + 1 ))
done
# On EOF, fall through to the validation below rather than dying silently.
read -rp "Enter number: " sel || sel=""
if ! [[ "$sel" =~ ^[0-9]+$ ]] || (( sel < 1 || sel > ${#proj_lines[@]} )); then
  log "Invalid project selection."
  exit 1
fi
selected_line="${proj_lines[$((sel-1))]}"
PROJECT_NAME=${selected_line% *}
PROJECT_ID=${selected_line##* }
log "-> Project selected: $PROJECT_NAME ($PROJECT_ID)"
# -------------------------
# Step 2: List servers
# -------------------------
log "[2/8] Fetching servers for project..."
servers_json=$(openstack server list --project "$PROJECT_ID" -f json)
# One "<name> <id>" line per server. The jq interpolation is \( ... ); the
# previous "\\(" produced a literal backslash instead of the field values.
mapfile -t srv_lines < <(jq -r '.[] | "\(.Name) \(.ID)"' <<<"$servers_json")
if (( ${#srv_lines[@]} == 0 )); then
  log "No servers found in this project."
  exit 1
fi
echo
log "Select a server:"
i=1
for line in "${srv_lines[@]}"; do
  # The ID is the last space-separated word; everything before it is the
  # name, which may itself contain spaces.
  sname=${line% *}
  sid=${line##* }
  printf '%2d) %s (%s)\n' "$i" "$sname" "$sid"
  # Plain assignment instead of ((i++)): never returns non-zero under set -e.
  i=$(( i + 1 ))
done
# On EOF, fall through to the validation below rather than dying silently.
read -rp "Enter number: " ssel || ssel=""
if ! [[ "$ssel" =~ ^[0-9]+$ ]] || (( ssel < 1 || ssel > ${#srv_lines[@]} )); then
  log "Invalid server selection."
  exit 1
fi
srv_line="${srv_lines[$((ssel-1))]}"
SERVER_NAME=${srv_line% *}
SERVER_ID=${srv_line##* }
log "-> Server selected: $SERVER_NAME ($SERVER_ID)"
# -------------------------
# Step 3: Ensure VM stopped
# -------------------------
log "[3/8] Checking VM state..."
# Remember the state found on entry; the final step reports it back.
cur_vm_state=$(get_vm_state)
cur_status=$(get_power_status)
log "Current: vm_state=${cur_vm_state:-unknown}, status=${cur_status:-unknown}"
# Either indicator reporting "stopped" is enough to skip the stop request.
if ! [[ "$cur_vm_state" == "stopped" || "$cur_status" == "SHUTOFF" ]]; then
  log "VM is not stopped."
  read -rp 'Type "YES" to stop the VM now: ' STOP_CONFIRM
  case "$STOP_CONFIRM" in
    YES)
      ;;
    *)
      log "User canceled stop. Exiting."
      exit 1
      ;;
  esac
  log "Stopping VM..."
  openstack server stop "$SERVER_ID"
  wait_for_stopped 900 5
fi
# -------------------------
# Step 4: Inspect boot source & map RBD
# -------------------------
log "[4/8] Inspecting server boot source..."
srv_show_json=$(openstack server show "$SERVER_ID" -f json)
image_field=$(jq -r '.image' <<<"$srv_show_json")
# NOTE(review): takes the first attached volume as the boot volume - confirm
# this holds for servers with several volumes.
vol_id=$(jq -r '.attached_volumes[0]?.id // empty' <<<"$srv_show_json")
# Ceph pool names differ between deployments; allow overriding them through
# the environment while keeping the previous values as defaults.
volume_pool="${RBD_VOLUME_POOL:-cinder-volumes}"
ephemeral_pool="${RBD_EPHEMERAL_POOL:-ephemeral-vms}"
if [[ "$image_field" == "N/A (booted from volume)" ]] && [[ -n "$vol_id" ]]; then
  # Volume-backed server: the image is named "volume-<volume id>".
  map_rbd_image "$volume_pool" "volume-$vol_id"
else
  # Otherwise use the server's own disk image, "<server id>_disk".
  map_rbd_image "$ephemeral_pool" "${SERVER_ID}_disk"
fi
# -------------------------
# Step 5: Largest partition & filesystem
# -------------------------
log "[5/8] Detecting largest partition..."
# lsblk -b reports sizes in bytes, so a numeric sort on the size column puts
# the largest partition last. (The previous doubled backslashes at the line
# ends were not line continuations and broke this pipeline.)
PART_ID=$(lsblk -bnro PATH,TYPE,SIZE "$DEV_ID" \
  | awk '$2=="part"{print $1, $3}' \
  | sort -k2 -n | tail -1 | awk '{print $1}')
if [[ -z "$PART_ID" ]]; then
  log "Error: No partitions found on $DEV_ID."
  exit 1
fi
log "Largest partition: $PART_ID"
# One invocation feeds both the terminal and the log file.
lsblk "$PART_ID" | tee -a "$LOGFILE"
part_type=$(blkid -o value -s TYPE "$PART_ID" || true)
sig=""
if [[ -z "$part_type" ]]; then
  # blkid gave nothing: fall back to the signature reported by file(1).
  sig=$(file -s "$PART_ID")
  case "$sig" in
    *"XFS"*) part_type="xfs" ;;
    *"ext2 filesystem"*) part_type="ext2" ;;
    *"ext3 filesystem"*) part_type="ext3" ;;
    *"ext4 filesystem"*) part_type="ext4" ;;
    *) part_type="" ;;
  esac
fi
if [[ -z "$part_type" ]]; then
  log "Warning: Could not auto-detect filesystem. 'file -s' says:"
  # Reuse the signature captured above instead of probing the device again.
  printf '%s\n' "$sig" | tee -a "$LOGFILE"
  read -rp "Enter filesystem type manually (ext4/xfs/ext3/ext2), or leave blank to abort: " manual_fs || manual_fs=""
  if [[ -z "$manual_fs" ]]; then
    log "Aborting per user input."
    exit 1
  fi
  part_type="$manual_fs"
fi
log "Detected filesystem: $part_type"
# The device is the FIRST field of each /proc/mounts line, so compare that
# field literally; the previous pattern demanded whitespace in front of the
# device and therefore could never match a mounted partition.
if awk -v dev="$PART_ID" '$1 == dev { found = 1 } END { exit !found }' /proc/mounts; then
  log "Error: $PART_ID appears to be mounted. Unmount before repair."
  exit 1
fi
# -------------------------
# Step 6: Confirm & repair
# -------------------------
log "[6/8] Confirm repair"
log "This will attempt to repair filesystem on: $PART_ID ($part_type)"
# On EOF, treat the answer as "no" rather than dying silently under set -e.
read -rp 'Type "YES" to proceed: ' CONFIRM || CONFIRM=""
if [[ "$CONFIRM" != "YES" ]]; then
  log "User canceled repair. Exiting."
  exit 1
fi
log "Repairing filesystem on $PART_ID..."
# Each repair tool now runs exactly ONCE, with tee sending its output to both
# the terminal and the log. Previously every tool was run twice (once for the
# log, once for the screen), i.e. the filesystem was repaired twice.
case "$part_type" in
  ext4|ext3|ext2)
    repair_rc=0
    fsck -y "$PART_ID" 2>&1 | tee -a "$LOGFILE" || repair_rc=${PIPESTATUS[0]}
    # fsck exit status is a bit mask: 1 = errors corrected, 2 = reboot
    # advised; only 4 and above mean the repair did not succeed.
    if (( repair_rc >= 4 )); then
      log "Error: fsck failed on $PART_ID (exit code $repair_rc)."
      exit 1
    fi
    ;;
  xfs)
    if ! command -v xfs_repair &>/dev/null; then
      log "Error: xfs_repair not available."
      exit 1
    fi
    # Try a normal repair first; -L zeroes the log and can discard recent
    # metadata updates, so it is used only when the plain run fails.
    # (Failure detection through tee relies on the script's `pipefail`.)
    if ! xfs_repair "$PART_ID" 2>&1 | tee -a "$LOGFILE"; then
      log "Warning: xfs_repair failed without -L (a dirty log is the usual cause); retrying with -L, which discards the log."
      if ! xfs_repair -L "$PART_ID" 2>&1 | tee -a "$LOGFILE"; then
        log "Error: xfs_repair -L failed on $PART_ID."
        exit 1
      fi
    fi
    ;;
  *)
    log "Error: Unsupported or unknown filesystem '$part_type'."
    exit 1
    ;;
esac
log "Filesystem repair completed."
# -------------------------
# Step 7: Unmap RBD
# -------------------------
log "[7/8] Unmapping RBD..."
if rbd unmap "$DEV_ID"; then
  # Clear DEV_ID only on success so the EXIT trap does not unmap again.
  DEV_ID=""
  log "RBD unmapped."
else
  # Do not claim success; DEV_ID stays set so the EXIT trap retries the unmap.
  log "Warning: failed to unmap $DEV_ID; it will be retried at exit."
fi
# -------------------------
# Step 8: Optionally start VM (with polling)
# -------------------------
echo
log "[8/8] Previous VM status was: vm_state=${cur_vm_state:-unknown}, status=${cur_status:-unknown}"
read -rp 'Do you want to start the VM again? Type "YES" to confirm: ' START_CONFIRM
# Only the exact answer YES starts the server; anything else leaves it stopped.
case "$START_CONFIRM" in
  YES)
    log "User confirmed start. Starting VM..."
    openstack server start "$SERVER_ID"
    wait_for_active 900 5
    log "Start sequence finished."
    ;;
  *)
    log "User declined start. Leaving VM stopped."
    ;;
esac
log "Done."
===========================================
OpenStack RBD Root Filesystem Repairer
===========================================
[1/8] Fetching OpenStack projects...
Select a project:
1) app-framework (2a75b1b104254f94acaf99d42cf066b5)
2) vdi (376ca768c2a14e4f81fcd1e8957f7e42)
3) service (3ce1bbcbfec94a7f9490ed355da1b727)
4) admin (5d979b86ab274fd496c7c916754adf0d)
5) proj001 (919bfad4b8fc426ba7650aed33d00825)
6) demo (b5f1291a5591450bbc753315b2c9635e)
7) ws6 (d5adda2bc1604f4f8273b63849f1790a)
Enter number: 6
-> Project selected: demo (b5f1291a5591450bbc753315b2c9635e)
[2/8] Fetching servers for project...
Select a server:
1) u22 (1287f3de-493d-4854-b1c5-df6bc9ac706c)
2) r10 (fa58f3d2-bff5-480f-abf8-b1214d8af593)
3) r8 (1302da7e-5852-4203-850e-65207544e06b)
4) r9 (82fe8f2f-d65b-4fe1-9364-e8ee7892a3c0)
5) win11zh (18c1f6e4-8615-450f-a9e4-fe2d5bfc51f7)
6) xpe (6682c310-ca62-4101-8558-ac60988c9269)
7) centos_9builder (24336dbb-b7d6-40fe-8509-58849c1d9e22)
8) builder (52e1c0b1-2c76-43c0-b055-9762b17e7832)
Enter number: 2
-> Server selected: r10 (fa58f3d2-bff5-480f-abf8-b1214d8af593)
[3/8] Checking VM state...
Current: vm_state=active, status=ACTIVE
VM is not stopped.
Type "YES" to stop the VM now: YES
Stopping VM...
Waiting for VM to stop (up to 900s)...
vm_state=stopped, status=SHUTOFF
VM is stopped.
[4/8] Inspecting server boot source...
Mapping RBD image: pool='cinder-volumes', image='volume-930ac86d-bc5e-4473-ac4e-655e6e3c9abf' ...
-> Mapped device: /dev/rbd5
[5/8] Detecting largest partition...
Largest partition: /dev/rbd5p4
NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINTS
rbd5p4 252:84 0 78.8G 0 part
Detected filesystem: xfs
[6/8] Confirm repair
This will attempt to repair filesystem on: /dev/rbd5p4 (xfs)
Type "YES" to proceed: YES
Repairing filesystem on /dev/rbd5p4...
Phase 1 - find and verify superblock...
- reporting progress in intervals of 15 minutes
Phase 2 - using internal log
- zero log...
- 11:15:10: zeroing log - 115425 of 115425 blocks done
- scan filesystem freespace and inode maps...
clearing needsrepair flag and regenerating metadata
- 11:15:10: scanning filesystem freespace - 36 of 36 allocation groups done
- found root inode chunk
Phase 3 - for each AG...
- scan and clear agi unlinked lists...
- 11:15:10: scanning agi unlinked lists - 36 of 36 allocation groups done
- process known inodes and perform inode discovery...
- agno = 0
- agno = 30
- agno = 15
- agno = 31
- agno = 16
- agno = 32
- agno = 33
- agno = 34
- agno = 17
- agno = 35
- agno = 18
- agno = 19
- agno = 20
- agno = 21
- agno = 22
- agno = 23
- agno = 24
- agno = 25
- agno = 26
- agno = 27
- agno = 28
- agno = 29
- agno = 1
- agno = 2
- agno = 3
- agno = 4
- agno = 5
- agno = 6
- agno = 7
- agno = 8
- agno = 9
- agno = 10
- agno = 11
- agno = 12
- agno = 13
- agno = 14
- 11:15:10: process known inodes and inode discovery - 36160 of 36160 inodes done
- process newly discovered inodes...
- 11:15:10: process newly discovered inodes - 36 of 36 allocation groups done
Phase 4 - check for duplicate blocks...
- setting up duplicate extent list...
- 11:15:10: setting up duplicate extent list - 36 of 36 allocation groups done
- check for inodes claiming duplicate blocks...
- agno = 1
- agno = 3
- agno = 8
- agno = 20
- agno = 6
- agno = 31
- agno = 7
- agno = 9
- agno = 4
- agno = 10
- agno = 11
- agno = 14
- agno = 22
- agno = 16
- agno = 0
- agno = 13
- agno = 18
- agno = 19
- agno = 17
- agno = 12
- agno = 21
- agno = 28
- agno = 24
- agno = 23
- agno = 25
- agno = 27
- agno = 30
- agno = 26
- agno = 29
- agno = 2
- agno = 5
- agno = 15
- agno = 34
- agno = 32
- agno = 35
- agno = 33
clearing reflink flag on inodes when possible
- 11:15:10: check for inodes claiming duplicate blocks - 36160 of 36160 inodes done
Phase 5 - rebuild AG headers and trees...
- 11:15:11: rebuild AG headers and trees - 36 of 36 allocation groups done
- reset superblock...
Phase 6 - check inode connectivity...
- resetting contents of realtime bitmap and summary inodes
- traversing filesystem ...
- traversal finished ...
- moving disconnected inodes to lost+found ...
Phase 7 - verify and correct link counts...
- 11:15:11: verify and correct link counts - 36 of 36 allocation groups done
Maximum metadata LSN (1:55699) is ahead of log (1:2).
Format log to cycle 4.
done
Phase 1 - find and verify superblock...
- reporting progress in intervals of 15 minutes
Phase 2 - using internal log
- zero log...
- 11:15:14: zeroing log - 115425 of 115425 blocks done
- scan filesystem freespace and inode maps...
clearing needsrepair flag and regenerating metadata
- 11:15:14: scanning filesystem freespace - 36 of 36 allocation groups done
- found root inode chunk
Phase 3 - for each AG...
- scan and clear agi unlinked lists...
- 11:15:14: scanning agi unlinked lists - 36 of 36 allocation groups done
- process known inodes and perform inode discovery...
- agno = 0
- agno = 30
- agno = 15
- agno = 31
- agno = 16
- agno = 32
- agno = 33
- agno = 34
- agno = 17
- agno = 35
- agno = 18
- agno = 19
- agno = 20
- agno = 21
- agno = 22
- agno = 23
- agno = 24
- agno = 25
- agno = 26
- agno = 27
- agno = 28
- agno = 29
- agno = 1
- agno = 2
- agno = 3
- agno = 4
- agno = 5
- agno = 6
- agno = 7
- agno = 8
- agno = 9
- agno = 10
- agno = 11
- agno = 12
- agno = 13
- agno = 14
- 11:15:14: process known inodes and inode discovery - 36160 of 36160 inodes done
- process newly discovered inodes...
- 11:15:14: process newly discovered inodes - 36 of 36 allocation groups done
Phase 4 - check for duplicate blocks...
- setting up duplicate extent list...
- 11:15:14: setting up duplicate extent list - 36 of 36 allocation groups done
- check for inodes claiming duplicate blocks...
- agno = 0
- agno = 1
- agno = 2
- agno = 8
- agno = 10
- agno = 17
- agno = 5
- agno = 6
- agno = 7
- agno = 22
- agno = 9
- agno = 12
- agno = 15
- agno = 18
- agno = 3
- agno = 29
- agno = 16
- agno = 24
- agno = 25
- agno = 26
- agno = 35
- agno = 4
- agno = 23
- agno = 13
- agno = 20
- agno = 27
- agno = 28
- agno = 32
- agno = 21
- agno = 19
- agno = 11
- agno = 14
- agno = 30
- agno = 31
- agno = 33
- agno = 34
- 11:15:14: check for inodes claiming duplicate blocks - 36160 of 36160 inodes done
Phase 5 - rebuild AG headers and trees...
- 11:15:15: rebuild AG headers and trees - 36 of 36 allocation groups done
- reset superblock...
Phase 6 - check inode connectivity...
- resetting contents of realtime bitmap and summary inodes
- traversing filesystem ...
- traversal finished ...
- moving disconnected inodes to lost+found ...
Phase 7 - verify and correct link counts...
- 11:15:15: verify and correct link counts - 36 of 36 allocation groups done
Maximum metadata LSN (1:55699) is ahead of log (1:2).
Format log to cycle 4.
done
Filesystem repair completed.
[7/8] Unmapping RBD...
RBD unmapped.
[8/8] Previous VM status was: vm_state=active, status=ACTIVE
Do you want to start the VM again? Type "YES" to confirm:
User declined start. Leaving VM stopped.
Done.