Skip to content
Snippets Groups Projects
Commit 586df9f8 authored by Daniel Göbel
Browse files

Merge branch 'feature/57-add-monitoring-of-traces' into 'development'

Resolve "Add monitoring of traces based on OpenTelemetry"

Closes #57

See merge request cmg/clowm/clowm-workflow-service!54
parents fb107dcc e7cc9859
No related branches found
No related tags found
No related merge requests found
from fastapi import FastAPI
from fastapi import FastAPI, Request, Response
from fastapi.exception_handlers import http_exception_handler, request_validation_exception_handler
from fastapi.exceptions import RequestValidationError, StarletteHTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.gzip import GZipMiddleware
from fastapi.responses import JSONResponse
from fastapi.routing import APIRoute
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.sdk.resources import SERVICE_NAME, Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.trace import Status, StatusCode
from app.api.api import api_router
from app.api.miscellaneous_endpoints import miscellaneous_router
......@@ -30,6 +40,31 @@ app = FastAPI(
root_path=settings.API_PREFIX,
)
# Tracing is opt-in: a tracer provider is only installed when an OTLP gRPC
# endpoint is configured in the settings.
if settings.OTLP_GRPC_ENDPOINT is not None:
    resource = Resource(attributes={SERVICE_NAME: "clowm-workflow-service"})
    provider = TracerProvider(resource=resource)
    # NOTE(review): insecure=True is handed to the exporter as-is; presumably
    # this selects a non-TLS gRPC channel -- confirm against the exporter docs.
    _span_exporter = OTLPSpanExporter(endpoint=settings.OTLP_GRPC_ENDPOINT, insecure=True)
    provider.add_span_processor(BatchSpanProcessor(_span_exporter))
    # Register the provider globally so trace.get_tracer_provider() returns it.
    trace.set_tracer_provider(provider)
@app.exception_handler(StarletteHTTPException)
async def trace_http_exception_handler(request: Request, exc: StarletteHTTPException) -> Response:
    """Mark the current span as failed and delegate to FastAPI's HTTP exception handler.

    Args:
        request: Request during whose handling the exception was raised.
        exc: The raised HTTP exception.

    Returns:
        The response produced by ``http_exception_handler`` for ``exc``.
    """
    span = trace.get_current_span()
    # Flag the span as an error first, then attach the exception to it.
    span.set_status(Status(StatusCode.ERROR))
    span.record_exception(exc)
    response = await http_exception_handler(request, exc)
    return response
@app.exception_handler(RequestValidationError)
async def trace_validation_exception_handler(request: Request, exc: RequestValidationError) -> JSONResponse:
    """Mark the current span as failed and delegate to FastAPI's validation error handler.

    Args:
        request: Request whose validation failed.
        exc: The raised request validation error.

    Returns:
        The JSON response produced by ``request_validation_exception_handler`` for ``exc``.
    """
    span = trace.get_current_span()
    # Flag the span as an error first, then attach the exception to it.
    span.set_status(Status(StatusCode.ERROR))
    span.record_exception(exc)
    response = await request_validation_exception_handler(request, exc)
    return response
# Instrument the FastAPI app with the globally registered tracer provider.
# NOTE(review): excluded_urls="health" presumably keeps health-check requests
# out of the traces -- confirm the matching rules in the instrumentation docs.
FastAPIInstrumentor.instrument_app(
    app,
    excluded_urls="health",
    tracer_provider=trace.get_tracer_provider(),
)
# CORS Settings for the API
app.add_middleware(
CORSMiddleware,
......
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager
from typing import Any
from opentelemetry.trace import Tracer
@asynccontextmanager
async def start_as_current_span_async(
    *args: Any,
    tracer: Tracer,
    **kwargs: Any,
) -> AsyncGenerator[None, None]:
    """Async context manager wrapping ``tracer.start_as_current_span``.

    The span is opened when the ``async with`` block is entered and closed
    when it is left.

    Args:
        *args: Positional arguments forwarded to ``tracer.start_as_current_span``
        tracer: Tracer on which the span is started
        **kwargs: Keyword arguments forwarded to ``tracer.start_as_current_span``

    Yields:
        None
    """
    span_scope = tracer.start_as_current_span(*args, **kwargs)
    with span_scope:
        yield
......@@ -5,9 +5,12 @@ from uuid import UUID
import dotenv
from fastapi import status
from httpx import AsyncClient
from opentelemetry import trace
from app.core.config import settings
# Module-level tracer used for the spans created in this module.
tracer = trace.get_tracer_provider().get_tracer(__name__)

# Load the .env file first so its variables are visible in the snapshot below.
dotenv.load_dotenv()
# All environment variables whose name starts with "NXF_".
base_env = {name: value for name, value in environ.items() if name.startswith("NXF_")}
......@@ -61,9 +64,11 @@ class SlurmClient:
"environment": env,
},
}
response = await self._client.post(
f"{settings.SLURM_ENDPOINT}slurm/{self.version}/job/submit", headers=self._headers, json=body
)
with tracer.start_as_current_span("slurm_submit_job"):
response = await self._client.post(
f"{settings.SLURM_ENDPOINT}slurm/{self.version}/job/submit", headers=self._headers, json=body
)
return int(response.json()["job_id"])
async def cancel_job(self, job_id: int) -> None:
......@@ -75,7 +80,10 @@ class SlurmClient:
job_id : int
ID of the job to cancel.
"""
await self._client.delete(f"{settings.SLURM_ENDPOINT}slurm/{self.version}/job/{job_id}", headers=self._headers)
with tracer.start_as_current_span("slurm_cancel_job"):
await self._client.delete(
f"{settings.SLURM_ENDPOINT}slurm/{self.version}/job/{job_id}", headers=self._headers
)
async def is_job_finished(self, job_id: int) -> bool:
"""
......@@ -91,13 +99,17 @@ class SlurmClient:
finished : bool
Flag if the job is finished
"""
response = await self._client.get(
f"{settings.SLURM_ENDPOINT}slurm/{self.version}/job/{job_id}", headers=self._headers
)
if response.status_code != status.HTTP_200_OK:
return True
try:
job_state = response.json()["jobs"][0]["job_state"]
return job_state == "COMPLETED" or job_state == "FAILED" or job_state == "CANCELLED"
except (KeyError, IndexError):
return True
with tracer.start_as_current_span("slurm_check_job_status") as span:
response = await self._client.get(
f"{settings.SLURM_ENDPOINT}slurm/{self.version}/job/{job_id}", headers=self._headers
)
span.set_attribute("slurm.job-status.request.code", response.status_code)
if response.status_code != status.HTTP_200_OK:
return True
try: # pragma: no cover
job_state = response.json()["jobs"][0]["job_state"]
span.set_attribute("slurm.job-status.state", job_state)
return job_state == "COMPLETED" or job_state == "FAILED" or job_state == "CANCELLED"
except (KeyError, IndexError) as ex:
span.record_exception(ex)
return True
......@@ -14,6 +14,7 @@ target-version = "py310"
plugins = ["pydantic.mypy", "sqlalchemy.ext.mypy.plugin"]
ignore_missing_imports = true
disallow_untyped_defs = true
namespace_packages = true
[tool.coverage.run]
concurrency = [
......
......@@ -24,3 +24,7 @@ jsonschema>=4.0.0,<5.0.0
mako
python-dotenv
Pillow>=10.0.0,<10.1.0
# Monitoring
opentelemetry-instrumentation-fastapi
opentelemetry-exporter-otlp-proto-grpc
......@@ -2,7 +2,14 @@
set -x
ruff --version
ruff check app
isort --version
isort -c app
black --version
black app --check
mypy --version
mypy app
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment