Merge pull request 'Fixes for restoring potentially failed services' (#57) from fix-restores-wrt-service-status into master
continuous-integration/drone/push Build is failing Details

Reviewed-on: #57
pull/58/head
Inex Code 2023-08-25 19:29:02 +03:00
commit 5fd4daa3e7
4 changed files with 71 additions and 19 deletions

View File

@ -283,7 +283,7 @@ class Backups:
Backups._store_last_snapshot(tag, snapshot)
service.post_restore()
except Exception as error:
Jobs.update(job, status=JobStatus.ERROR)
Jobs.update(job, status=JobStatus.ERROR, status_text=str(error))
raise error
Jobs.update(job, status=JobStatus.FINISHED)
@ -306,9 +306,14 @@ class Backups:
snapshot: Snapshot,
job: Job,
) -> None:
Jobs.update(
job, status=JobStatus.CREATED, status_text=f"Waiting for pre-restore backup"
)
failsafe_snapshot = Backups.back_up(service)
Jobs.update(job, status=JobStatus.RUNNING)
Jobs.update(
job, status=JobStatus.RUNNING, status_text=f"Restoring from {snapshot.id}"
)
try:
Backups._restore_service_from_snapshot(
service,
@ -316,9 +321,19 @@ class Backups:
verify=False,
)
except Exception as error:
Jobs.update(
job,
status=JobStatus.ERROR,
status_text=f"Restore failed with {str(error)}, reverting to {failsafe_snapshot.id}",
)
Backups._restore_service_from_snapshot(
service, failsafe_snapshot.id, verify=False
)
Jobs.update(
job,
status=JobStatus.ERROR,
status_text=f"Restore failed with {str(error)}, reverted to {failsafe_snapshot.id}",
)
raise error
@staticmethod
@ -335,20 +350,33 @@ class Backups:
try:
Backups._assert_restorable(snapshot)
Jobs.update(
job, status=JobStatus.RUNNING, status_text="Stopping the service"
)
with StoppedService(service):
Backups.assert_dead(service)
if strategy == RestoreStrategy.INPLACE:
Backups._inplace_restore(service, snapshot, job)
else: # verify_before_download is our default
Jobs.update(job, status=JobStatus.RUNNING)
Jobs.update(
job,
status=JobStatus.RUNNING,
status_text=f"Restoring from {snapshot.id}",
)
Backups._restore_service_from_snapshot(
service, snapshot.id, verify=True
)
service.post_restore()
Jobs.update(
job,
status=JobStatus.RUNNING,
progress=90,
status_text="Restarting the service",
)
except Exception as error:
Jobs.update(job, status=JobStatus.ERROR)
Jobs.update(job, status=JobStatus.ERROR, status_text=str(error))
raise error
Jobs.update(job, status=JobStatus.FINISHED)

View File

@ -13,7 +13,7 @@ from selfprivacy_api.services.owned_path import OwnedPath
from selfprivacy_api import utils
from selfprivacy_api.utils.waitloop import wait_until_true
DEFAULT_START_STOP_TIMEOUT = 10 * 60
DEFAULT_START_STOP_TIMEOUT = 5 * 60
class ServiceStatus(Enum):
@ -283,18 +283,28 @@ class StoppedService:
def __enter__(self) -> Service:
self.original_status = self.service.get_status()
if self.original_status != ServiceStatus.INACTIVE:
self.service.stop()
wait_until_true(
lambda: self.service.get_status() == ServiceStatus.INACTIVE,
timeout_sec=DEFAULT_START_STOP_TIMEOUT,
)
if self.original_status not in [ServiceStatus.INACTIVE, ServiceStatus.FAILED]:
try:
self.service.stop()
wait_until_true(
lambda: self.service.get_status() == ServiceStatus.INACTIVE,
timeout_sec=DEFAULT_START_STOP_TIMEOUT,
)
except TimeoutError as error:
raise TimeoutError(
f"timed out waiting for {self.service.get_display_name()} to stop"
) from error
return self.service
def __exit__(self, type, value, traceback):
if self.original_status in [ServiceStatus.ACTIVATING, ServiceStatus.ACTIVE]:
self.service.start()
wait_until_true(
lambda: self.service.get_status() == ServiceStatus.ACTIVE,
timeout_sec=DEFAULT_START_STOP_TIMEOUT,
)
try:
self.service.start()
wait_until_true(
lambda: self.service.get_status() == ServiceStatus.ACTIVE,
timeout_sec=DEFAULT_START_STOP_TIMEOUT,
)
except TimeoutError as error:
raise TimeoutError(
f"timed out waiting for {self.service.get_display_name()} to start"
) from error

View File

@ -135,8 +135,12 @@ class DummyService(Service):
@classmethod
def stop(cls):
cls.set_status(ServiceStatus.DEACTIVATING)
cls.change_status_with_async_delay(ServiceStatus.INACTIVE, cls.startstop_delay)
# simulate a failing service unable to stop
if not cls.get_status() == ServiceStatus.FAILED:
cls.set_status(ServiceStatus.DEACTIVATING)
cls.change_status_with_async_delay(
ServiceStatus.INACTIVE, cls.startstop_delay
)
@classmethod
def start(cls):

View File

@ -12,6 +12,7 @@ import tempfile
import selfprivacy_api.services as services
from selfprivacy_api.services import Service, get_all_services
from selfprivacy_api.services.service import ServiceStatus
from selfprivacy_api.services import get_service_by_id
from selfprivacy_api.services.test_service import DummyService
@ -464,10 +465,19 @@ def restore_strategy(request) -> RestoreStrategy:
return RestoreStrategy.INPLACE
@pytest.fixture(params=["failed", "healthy"])
def failed(request) -> bool:
    """Parametrize a test over a "failed" and a "healthy" variant.

    Returns True when the current parameter is "failed" and False
    when it is "healthy".
    """
    return request.param == "failed"
def test_restore_snapshot_task(
backups, dummy_service, restore_strategy, simulated_service_stopping_delay
backups, dummy_service, restore_strategy, simulated_service_stopping_delay, failed
):
dummy_service.set_delay(simulated_service_stopping_delay)
if failed:
dummy_service.set_status(ServiceStatus.FAILED)
Backups.back_up(dummy_service)
snaps = Backups.get_snapshots(dummy_service)