Add playbook to gracefully stop and reboot the zuul cluster

This should simplify the process of applying patches to the cluster.

Change-Id: I28756e32c2f42186e11d78e4ca461e808026f632
changes/17/843317/8
Clark Boylan 8 months ago
parent 03a0bef7b6
commit 88425158a1

@@ -0,0 +1,97 @@
# TODO: We need to add a locking/failsafe check mechanism.
# TODO: Stop pulling in the hourly job if we do this.

# Run the zuul_pull.yaml playbook before any service is touched; per the play
# name, the intent is that every restart/reboot below lands on the same image.
- name: Ensure we are going to restart/reboot on the same image
  import_playbook: zuul_pull.yaml
# TODO: Do we want to force disabled servers to be rebooted too?

# Rolling reboot of the executors. `serial: 1` makes Ansible run the whole
# task list on one host before moving on to the next, so only a single
# executor is out of service at any time. Hosts in the `disabled` group are
# excluded by the host pattern.
- name: Reboot zuul-executors gracefully one at a time
  hosts: 'zuul-executor:!disabled'
  serial: 1
  tasks:
    # Runs the `graceful` task file shipped with the zuul-executor role.
    - name: Gracefully stop the executor
      include_role:
        name: zuul-executor
        tasks_from: graceful

    # The reboot module is invoked with its default arguments.
    - name: Reboot the executor server
      reboot:

    # Runs the `start` task file shipped with the zuul-executor role.
    - name: Start the executor
      include_role:
        name: zuul-executor
        tasks_from: start
# Rolling reboot of the mergers. `serial: 1` makes Ansible run the whole task
# list on one host before moving on to the next, so only a single merger is
# out of service at any time. Hosts in the `disabled` group are excluded by
# the host pattern.
- name: Reboot zuul-mergers gracefully one at a time
  hosts: 'zuul-merger:!disabled'
  serial: 1
  tasks:
    # Runs the `graceful` task file shipped with the zuul-merger role.
    - name: Gracefully stop the merger
      include_role:
        name: zuul-merger
        tasks_from: graceful

    # The reboot module is invoked with its default arguments.
    - name: Reboot the merger server
      reboot:

    # Runs the `start` task file shipped with the zuul-merger role.
    - name: Start the merger
      include_role:
        name: zuul-merger
        tasks_from: start
# TODO: Should we do both schedulers with reboots then do the webs without
# reboots?

# Rolling reboot of the scheduler hosts. `serial: 1` makes Ansible run the
# whole task list on one host before moving on to the next. On each host the
# play stops the scheduler and web processes, reboots, starts them again, and
# then polls the Zuul components API until this host's scheduler, web and
# fingergw components all report the `running` state.
- name: Reboot zuul-schedulers gracefully one at a time
  hosts: 'zuul-scheduler:!disabled'
  serial: 1
  tasks:
    - name: Stop the scheduler process
      include_role:
        name: zuul-scheduler
        tasks_from: stop

    # The zuul-web role is applied to the same (scheduler) hosts here.
    - name: Stop the web processes
      include_role:
        name: zuul-web
        tasks_from: stop

    # The reboot module is invoked with its default arguments.
    - name: Reboot the scheduler server
      reboot:

    - name: Start the scheduler process
      include_role:
        name: zuul-scheduler
        tasks_from: start

    - name: Start the web processes
      include_role:
        name: zuul-web
        tasks_from: start

    # Each wait below succeeds only when the API answers 200 AND the JMESPath
    # query yields exactly one entry for this host whose state is 'running'.
    # The `until` expressions are written without "{{ }}": Ansible already
    # evaluates conditionals as Jinja expressions and warns when delimiters
    # are used inside them.
    # NOTE(review): the queries match the API's `hostname` field against
    # inventory_hostname - assumes the two use the same name form; confirm.
    - name: Wait for scheduler to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 3 hours (360 retries x 30 second delay)
      retries: 360
      delay: 30
      until: >-
        components.status == 200
        and components.content | from_json | json_query(scheduler_query) | length == 1
        and components.content | from_json | json_query(scheduler_query) | first == 'running'
      vars:
        scheduler_query: "scheduler[?hostname=='{{ inventory_hostname }}'].state"

    - name: Wait for web to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 3 hours (360 retries x 30 second delay)
      retries: 360
      delay: 30
      until: >-
        components.status == 200
        and components.content | from_json | json_query(web_query) | length == 1
        and components.content | from_json | json_query(web_query) | first == 'running'
      vars:
        web_query: "web[?hostname=='{{ inventory_hostname }}'].state"

    # NOTE(review): no task in this play starts fingergw explicitly;
    # presumably one of the `start` task files above brings it up - confirm.
    - name: Wait for fingergw to be running
      uri:
        url: https://zuul.opendev.org/api/components
        method: GET
        return_content: true
      register: components
      # 45 minutes (180 retries x 15 second delay)
      retries: 180
      delay: 15
      until: >-
        components.status == 200
        and components.content | from_json | json_query(finger_query) | length == 1
        and components.content | from_json | json_query(finger_query) | first == 'running'
      vars:
        finger_query: "fingergw[?hostname=='{{ inventory_hostname }}'].state"
Loading…
Cancel
Save