From f55d8a453d5a251caad2df68100b83cd571c0a1b Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 4 Sep 2023 13:32:26 +0200 Subject: [PATCH] Add health endpoint to appservice and add metrics via prometheus (#70) This adds a `/healthz` endpoint to the appservice which allows this to work more nicely in Kubernetes. It also adds some metrics for tracking the provisioning state. Grafana result: ![image](https://github.com/Gnuxie/Draupnir/assets/1374914/9426c8e6-2c1c-469c-8902-1b9e2b6db529) Note: The `@ts-ignore` comments are sadly required since the `_getValue` method is not public :/ I didn't find another solution, apart from maybe tracking the value elsewhere. * Add health endpoint to appservice and add metrics via prometheus * Ensure that we don't have duplicate metrics when the appservice is registered multiple times * Move gauge modifications to utils function * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix typo --- src/appservice/AppService.ts | 38 ++++++++++++++++++---- src/appservice/MjolnirManager.ts | 27 +++++++++++----- src/utils.ts | 55 ++++++++++++++++++++++++++------ 3 files changed, 97 insertions(+), 23 deletions(-) diff --git a/src/appservice/AppService.ts b/src/appservice/AppService.ts index f2687cc8..51912d60 100644 --- a/src/appservice/AppService.ts +++ b/src/appservice/AppService.ts @@ -25,7 +25,7 @@ limitations under the License. * are NOT distributed, contributed, committed, or licensed under the Apache License. 
*/ -import { AppServiceRegistration, Bridge, Request, WeakEvent, BridgeContext, MatrixUser, Logger } from "matrix-appservice-bridge"; +import { AppServiceRegistration, Bridge, Request, WeakEvent, BridgeContext, MatrixUser, Logger, setBridgeVersion, PrometheusMetrics } from "matrix-appservice-bridge"; import { MjolnirManager } from ".//MjolnirManager"; import { DataStore } from ".//datastore"; import { PgDataStore } from "./postgres/PgDataStore"; @@ -33,6 +33,8 @@ import { Api } from "./Api"; import { IConfig } from "./config/config"; import { AccessControl } from "./AccessControl"; import { AppserviceCommandHandler } from "./bot/AppserviceCommandHandler"; +import { SOFTWARE_VERSION } from "../config"; +import { Registry } from 'prom-client'; const log = new Logger("AppService"); /** @@ -54,6 +56,7 @@ export class MjolnirAppService { public readonly mjolnirManager: MjolnirManager, public readonly accessControl: AccessControl, private readonly dataStore: DataStore, + private readonly prometheusMetrics: PrometheusMetrics ) { this.api = new Api(config.homeserver.url, mjolnirManager); this.commands = new AppserviceCommandHandler(this); @@ -75,21 +78,35 @@ export class MjolnirAppService { // It also allows us to combine constructor/initialize logic // to make the code base much simpler. A small hack to pay for an overall less hacky code base. 
controller: { - onUserQuery: () => {throw new Error("Mjolnir uninitialized")}, - onEvent: () => {throw new Error("Mjolnir uninitialized")}, + onUserQuery: () => { throw new Error("Mjolnir uninitialized") }, + onEvent: () => { throw new Error("Mjolnir uninitialized") }, }, suppressEcho: false, }); await bridge.initialise(); const accessControlListId = await bridge.getBot().getClient().resolveRoom(config.adminRoom); const accessControl = await AccessControl.setupAccessControl(accessControlListId, bridge); - const mjolnirManager = await MjolnirManager.makeMjolnirManager(dataStore, bridge, accessControl); + // Activate /metrics endpoint for Prometheus + + // This should happen automatically but in testing this didn't happen in the docker image + setBridgeVersion(SOFTWARE_VERSION); + + // Due to the way the tests and this prom library works we need to explicitly create a new one each time. + const prometheus = bridge.getPrometheusMetrics(true, new Registry()); + const instanceCountGauge = prometheus.addGauge({ + name: "draupnir_instances", + help: "Count of Draupnir Instances", + labels: ["status", "uuid"], + }); + + const mjolnirManager = await MjolnirManager.makeMjolnirManager(dataStore, bridge, accessControl, instanceCountGauge); const appService = new MjolnirAppService( config, bridge, mjolnirManager, accessControl, - dataStore + dataStore, + prometheus ); bridge.opts.controller = { onUserQuery: appService.onUserQuery.bind(appService), @@ -114,7 +131,7 @@ export class MjolnirAppService { return service; } - public onUserQuery (queriedUser: MatrixUser) { + public onUserQuery(queriedUser: MatrixUser) { return {}; // auto-provision users with no additonal data } @@ -160,6 +177,15 @@ export class MjolnirAppService { log.info("Starting MjolnirAppService, Matrix-side to listen on port", port); this.api.start(this.config.webAPI.port); await this.bridge.listen(port); + this.prometheusMetrics.addAppServicePath(this.bridge); + this.bridge.addAppServicePath({ + method: "GET", 
+ path: "/healthz", + authenticate: false, + handler: async (_req, res) => { + res.status(200).send('ok'); + } + }); log.info("MjolnirAppService started successfully"); } diff --git a/src/appservice/MjolnirManager.ts b/src/appservice/MjolnirManager.ts index 5f3d8137..a6a6645a 100644 --- a/src/appservice/MjolnirManager.ts +++ b/src/appservice/MjolnirManager.ts @@ -11,6 +11,8 @@ import EventEmitter from "events"; import { MatrixEmitter } from "../MatrixEmitter"; import { Permalinks } from "../commands/interface-manager/Permalinks"; import { MatrixRoomReference } from "../commands/interface-manager/MatrixRoomReference"; +import { Gauge } from "prom-client"; +import { decrementGaugeValue, incrementGaugeValue } from "../utils"; const log = new Logger('MjolnirManager'); @@ -30,7 +32,8 @@ export class MjolnirManager { private constructor( private readonly dataStore: DataStore, private readonly bridge: Bridge, - private readonly accessControl: AccessControl + private readonly accessControl: AccessControl, + private readonly instanceCountGauge: Gauge<"status" | "uuid"> ) { } @@ -42,8 +45,8 @@ export class MjolnirManager { * @param accessControl Who has access to the bridge. * @returns A new mjolnir manager. */ - public static async makeMjolnirManager(dataStore: DataStore, bridge: Bridge, accessControl: AccessControl): Promise { - const mjolnirManager = new MjolnirManager(dataStore, bridge, accessControl); + public static async makeMjolnirManager(dataStore: DataStore, bridge: Bridge, accessControl: AccessControl, instanceCountGauge: Gauge<"status" | "uuid">): Promise { + const mjolnirManager = new MjolnirManager(dataStore, bridge, accessControl, instanceCountGauge); await mjolnirManager.startMjolnirs(await dataStore.list()); return mjolnirManager; } @@ -55,7 +58,7 @@ export class MjolnirManager { * @param client A client for the appservice virtual user that the new mjolnir should use. * @returns A new managed mjolnir. 
*/ - public async makeInstance(requestingUserId: string, managementRoomId: string, client: MatrixClient): Promise { + public async makeInstance(localPart: string, requestingUserId: string, managementRoomId: string, client: MatrixClient): Promise { const mxid = await client.getUserId(); const intentListener = new MatrixIntentListener(mxid); const managedMjolnir = new ManagedMjolnir( @@ -70,6 +73,9 @@ export class MjolnirManager { await managedMjolnir.start(); this.mjolnirs.set(mxid, managedMjolnir); this.unstartedMjolnirs.delete(mxid); + incrementGaugeValue(this.instanceCountGauge, "offline", localPart); + decrementGaugeValue(this.instanceCountGauge, "disabled", localPart); + incrementGaugeValue(this.instanceCountGauge, "online", localPart); return managedMjolnir; } @@ -79,7 +85,7 @@ export class MjolnirManager { * @param ownerId The owner of the mjolnir. We ask for it explicitly to not leak access to another user's mjolnir. * @returns The matching managed mjolnir instance. */ - public getMjolnir(mjolnirId: string, ownerId: string): ManagedMjolnir|undefined { + public getMjolnir(mjolnirId: string, ownerId: string): ManagedMjolnir | undefined { const mjolnir = this.mjolnirs.get(mjolnirId); if (mjolnir) { if (mjolnir.ownerId !== ownerId) { @@ -141,7 +147,7 @@ export class MjolnirManager { } }); - const mjolnir = await this.makeInstance(requestingUserId, managementRoomId, mjIntent.matrixClient); + const mjolnir = await this.makeInstance(mjolnirLocalPart, requestingUserId, managementRoomId, mjIntent.matrixClient); await mjolnir.createFirstList(requestingUserId, "list"); await this.dataStore.store({ @@ -164,7 +170,7 @@ export class MjolnirManager { return [...this.unstartedMjolnirs.values()]; } - public findUnstartedMjolnir(localPart: string): UnstartedMjolnir|undefined { + public findUnstartedMjolnir(localPart: string): UnstartedMjolnir | undefined { return [...this.unstartedMjolnirs.values()].find(unstarted => unstarted.mjolnirRecord.local_part === localPart); } @@ 
-195,8 +201,11 @@ export class MjolnirManager { // Don't await, we don't want to clobber initialization just because we can't tell someone they're no longer allowed. mjIntent.matrixClient.sendNotice(mjolnirRecord.management_room, `Your mjolnir has been disabled by the administrator: ${access.rule?.reason ?? "no reason supplied"}`); this.reportUnstartedMjolnir(UnstartedMjolnir.FailCode.Unauthorized, access.outcome, mjolnirRecord, mjIntent.userId); + decrementGaugeValue(this.instanceCountGauge, "online", mjolnirRecord.local_part); + incrementGaugeValue(this.instanceCountGauge, "disabled", mjolnirRecord.local_part); } else { await this.makeInstance( + mjolnirRecord.local_part, mjolnirRecord.owner, mjolnirRecord.management_room, mjIntent.matrixClient, @@ -205,6 +214,8 @@ export class MjolnirManager { // Don't await, we don't want to clobber initialization if this fails. mjIntent.matrixClient.sendNotice(mjolnirRecord.management_room, `Your mjolnir could not be started. Please alert the administrator`); this.reportUnstartedMjolnir(UnstartedMjolnir.FailCode.StartError, e, mjolnirRecord, mjIntent.userId); + decrementGaugeValue(this.instanceCountGauge, "online", mjolnirRecord.local_part); + incrementGaugeValue(this.instanceCountGauge, "offline", mjolnirRecord.local_part); }); } } @@ -279,7 +290,7 @@ export class MatrixIntentListener extends EventEmitter implements MatrixEmitter public handleEvent(mxEvent: WeakEvent) { // These are ordered to be the same as matrix-bot-sdk's MatrixClient // They shouldn't need to be, but they are just in case it matters. 
- if (mxEvent['type'] === 'm.room.member' && mxEvent.state_key === this.mjolnirId) { + if (mxEvent['type'] === 'm.room.member' && mxEvent.state_key === this.mjolnirId) { if (mxEvent['content']['membership'] === 'leave') { this.emit('room.leave', mxEvent.room_id, mxEvent); } diff --git a/src/utils.ts b/src/utils.ts index 46896010..5ec7c71a 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -41,6 +41,7 @@ import * as _ from '@sentry/tracing'; // Performing the import activates tracing import ManagementRoomOutput from "./ManagementRoomOutput"; import { IConfig } from "./config"; import { MatrixSendClient } from "./MatrixEmitter"; +import { Gauge } from "prom-client"; // Define a few aliases to simplify parsing durations. @@ -70,6 +71,42 @@ export function setToArray(set: Set): T[] { return arr; } +/** + * This increments a prometheus gauge. Used in the Appservice MjolnirManager. + * + * The ts-ignore is mandatory since we access a private method due to lack of a public one. + * + * See https://github.com/Gnuxie/Draupnir/pull/70#discussion_r1299188922 + * + * @param gauge The Gauge to be modified + * @param status The status value that should be modified + * @param uuid The UUID of the instance. (Usually the localPart) + */ +export function incrementGaugeValue(gauge: Gauge<"status" | "uuid">, status: "offline" | "disabled" | "online", uuid: string) { + // @ts-ignore + if (!gauge._getValue({ status: status, uuid: uuid })) { + gauge.inc({ status: status, uuid: uuid }); + } +} + +/** + * This decrements a prometheus gauge. Used in the Appservice MjolnirManager. + * + * The ts-ignore is mandatory since we access a private method due to lack of a public one. + * + * See https://github.com/Gnuxie/Draupnir/pull/70#discussion_r1299188922 + * + * @param gauge The Gauge to be modified + * @param status The status value that should be modified + * @param uuid The UUID of the instance. 
(Usually the localPart) + */ +export function decrementGaugeValue(gauge: Gauge<"status" | "uuid">, status: "offline" | "disabled" | "online", uuid: string) { + // @ts-ignore + if (gauge._getValue({ status: status, uuid: uuid })) { + gauge.dec({ status: status, uuid: uuid }); + } +} + export function isTrueJoinEvent(event: any): boolean { const membership = event['content']['membership'] || 'join'; let prevMembership = "leave"; @@ -134,7 +171,7 @@ export async function getMessagesByUserIn(client: MatrixSendClient, sender: stri const isGlob = sender.includes("*"); const roomEventFilter = { rooms: [roomId], - ... isGlob ? {} : {senders: [sender]} + ...isGlob ? {} : { senders: [sender] } }; const matcher = new MatrixGlob(sender); @@ -167,11 +204,11 @@ export async function getMessagesByUserIn(client: MatrixSendClient, sender: stri * if `null`, start from the most recent point in the timeline. * @returns The response part of the `/messages` API, see `BackfillResponse`. */ - async function backfill(from: string|null): Promise { + async function backfill(from: string | null): Promise { const qs = { filter: JSON.stringify(roomEventFilter), dir: "b", - ... from ? { from } : {} + ...from ? { from } : {} }; LogService.info("utils", "Backfilling with token: " + from); return client.doRequest("GET", `/_matrix/client/v3/rooms/${encodeURIComponent(roomId)}/messages`, qs); @@ -195,10 +232,10 @@ export async function getMessagesByUserIn(client: MatrixSendClient, sender: stri } // We check that we have the token because rooms/messages is not required to provide one // and will not provide one when there is no more history to paginate. - let token: string|null = null; + let token: string | null = null; do { const bfMessages: BackfillResponse = await backfill(token); - const previousToken: string|null = token; + const previousToken: string | null = token; token = bfMessages['end'] ?? 
null; const events = filterEvents(bfMessages['chunk'] || []); // If we are using a glob, there may be no relevant events in this chunk. @@ -287,13 +324,13 @@ function patchMatrixClientForConciseExceptions() { const method: string | undefined = err.method ? err.method : "req" in err && err.req instanceof ClientRequest - ? err.req.method - : params.method; + ? err.req.method + : params.method; const path: string = err.url ? err.url : "req" in err && err.req instanceof ClientRequest - ? err.req.path - : params.uri ?? ''; + ? err.req.path + : params.uri ?? ''; let body: unknown = null; if ("body" in err) { body = err.body;