From 0c59eff0e87e844a313ca8feda13227f63b4cd98 Mon Sep 17 00:00:00 2001 From: Clark Boylan Date: Tue, 6 Sep 2022 11:37:28 -0700 Subject: [PATCH] Retry apt tasks in zuul_reboot if apt lock is held The zuul_reboot playbook runs on each zuul server at what essentially become random times based on how long the previous servers took to be updated. We have seen this result in our apt tasks colliding with unattended upgrades on the server. Latest Ansible would let us work around this using the lock_timeout parameter to the apt module, but the version we use on bridge does not support this parameter. Instead we check the failure message for 'Failed to lock apt for exclusive operation' and if present we retry. We wait 30 seconds between retries and will perform up to 40 attempts for a total of 20 minutes of waiting. This method should also be forward compatible with new Ansible. If the lock is held for longer than 20 minutes it likely implies something has gone wrong and we will need to perform manual intervention anyway. 
Change-Id: I3171838a30e3ea496bb08f8b6ab1c95755b2ff3c --- playbooks/zuul_reboot.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/playbooks/zuul_reboot.yaml b/playbooks/zuul_reboot.yaml index fa5b334469..db14461c5c 100644 --- a/playbooks/zuul_reboot.yaml +++ b/playbooks/zuul_reboot.yaml @@ -18,6 +18,11 @@ apt: update_cache: yes upgrade: yes + register: apt_action + # 20 minute wait for unattended-upgrades to complete + delay: 30 + retries: 40 + until: apt_action is success or 'Failed to lock apt for exclusive operation' not in apt_action.msg - name: Reboot the executor server reboot: - name: Start the executor @@ -37,6 +42,11 @@ apt: update_cache: yes upgrade: yes + register: apt_action + # 20 minute wait for unattended-upgrades to complete + delay: 30 + retries: 40 + until: apt_action is success or 'Failed to lock apt for exclusive operation' not in apt_action.msg - name: Reboot the merger server reboot: - name: Start the merger @@ -62,6 +72,11 @@ apt: update_cache: yes upgrade: yes + register: apt_action + # 20 minute wait for unattended-upgrades to complete + delay: 30 + retries: 40 + until: apt_action is success or 'Failed to lock apt for exclusive operation' not in apt_action.msg - name: Reboot the scheduler server reboot: - name: Start the scheduler process