From daaf6eae286605e5ddd926fe13956edce5d2c08b Mon Sep 17 00:00:00 2001 From: Rory& Date: Thu, 26 Mar 2026 11:46:48 +0100 Subject: [PATCH] Embeds: factor out from postMessage --- src/api/util/handlers/Message.ts | 141 +-------------- src/api/util/utility/EmbedHandlers.ts | 161 +++++++++++++++++- .../subconfigurations/limits/MessageLimits.ts | 1 + src/util/entities/EmbedCache.ts | 7 + .../1772404321402-EmbedCacheCreatedAt.ts | 13 ++ src/util/util/extensions/Array.ts | 13 ++ 6 files changed, 202 insertions(+), 134 deletions(-) create mode 100644 src/util/migration/postgres/1772404321402-EmbedCacheCreatedAt.ts diff --git a/src/api/util/handlers/Message.ts b/src/api/util/handlers/Message.ts index ed41341e4..1fb9a080d 100644 --- a/src/api/util/handlers/Message.ts +++ b/src/api/util/handlers/Message.ts @@ -16,7 +16,7 @@ along with this program. If not, see . */ -import { EmbedHandlers, randomString } from "@spacebar/api"; +import { EmbedHandlers, randomString, fillMessageUrlEmbeds } from "@spacebar/api"; import { Application, Attachment, @@ -69,13 +69,12 @@ import { UnfurledMediaItem, BaseMessageComponents, v1CompTypes, - PartialUser + PartialUser, } from "@spacebar/schemas"; const allow_empty = false; // TODO: check webhook, application, system author, stickers // TODO: embed gifs/videos/images -const LINK_REGEX = /?/g; function checkActionRow(row: ActionRowComponent, knownComponentIds: string[], errors: Record, rowIndex: number) { if (!row.components) { return; @@ -774,139 +773,15 @@ export async function handleMessage(opts: MessageOptions): Promise { // TODO: cache link result in db export async function postHandleMessage(message: Message) { - const conf = Config.get(); - const content = message.content?.replace(/ *`[^)]*` */g, ""); // remove markdown - - const linkMatches = content?.match(LINK_REGEX) || []; message.clean_data(); - const data = { ...message.toJSON() }; - const currentNormalizedUrls = new Set(); - for (const link of linkMatches) { - // Don't process links in <> - if (link.startsWith("<") && link.endsWith(">")) { - continue; - } - try { - const normalized = normalizeUrl(link); - currentNormalizedUrls.add(normalized); - } catch (e) { - /* empty */ - } - } - if (data.embeds != undefined) { - data.embeds?.forEach((embed) => { - if (!embed.type) { - embed.type = EmbedType.rich; - } - }); - } - // Filter out embeds that could be links, start from scratch - if (data.embeds != undefined) { - data.embeds = data.embeds?.filter((embed) => embed.type === "rich"); - } + message.embeds ??= []; + message.embeds.forEach((embed) => { + // we need to handle false-y values (empty string) here, so cant use ??= + embed.type ||= EmbedType.rich; + }); - const seenNormalizedUrls = new Set(); - const uniqueLinks: string[] = []; - - for (const link of linkMatches.slice(0, 20)) { - // embed max 20 links - TODO: make this configurable with instance policies - // Don't embed links in <> - if (link.startsWith("<") && link.endsWith(">")) continue; - - try { - const normalized = normalizeUrl(link); - - if (!seenNormalizedUrls.has(normalized)) { - seenNormalizedUrls.add(normalized); - uniqueLinks.push(link); - } - } catch (e) { - // Invalid URL, skip - } - } - - if (uniqueLinks.length === 0) { - // No valid unique links found, update message to remove old embeds - if (data.embeds != undefined) { - data.embeds = data.embeds?.filter((embed) => embed.type === "rich"); - } - // author value is already included in message.toJSON() - const event = { - event: "MESSAGE_UPDATE", - channel_id: message.channel_id, - data: { - ...message.toJSON(), - embeds: data.embeds == undefined ? message.embeds || [] : data.embeds, - }, - } satisfies MessageUpdateEvent; - const embeds = data.embeds == undefined ? [] : data.embeds; - await Promise.all([emitEvent(event), Message.update({ id: message.id, channel_id: message.channel_id }, { embeds: embeds })]); - return; - } - - const cachePromises = []; - - for (const link of uniqueLinks) { - let url: URL; - try { - url = new URL(link); - } catch (e) { - // Skip invalid URLs - continue; - } - - const normalizedUrl = normalizeUrl(link); - - // Check cache using normalized URL - const cached = await EmbedCache.findOne({ - where: { url: normalizedUrl }, - }); - - if (cached) { - if (data.embeds == undefined) { - data.embeds = []; - } - data.embeds?.push(cached.embed); - continue; - } - - // bit gross, but whatever! - const endpointPublic = conf.cdn.endpointPublic; // lol - const handler = url.hostname === new URL(endpointPublic!).hostname ? EmbedHandlers["self"] : EmbedHandlers[url.hostname] || EmbedHandlers["default"]; - - try { - let res = await handler(url); - if (!res) continue; - // tried to use shorthand but types didn't like me L - if (!Array.isArray(res)) res = [res]; - - for (const embed of res) { - // Cache with normalized URL - const cache = EmbedCache.create({ - url: normalizedUrl, - embed: embed, - }); - cachePromises.push(cache.save()); - if (data.embeds == undefined) { - data.embeds = []; - } - data.embeds?.push(embed); - } - } catch (e) { - console.error(`[Embeds] Error while generating embed for ${link}`, e); - } - } - const embeds = data.embeds == undefined ? [] : data.embeds; - await Promise.all([ - emitEvent({ - event: "MESSAGE_UPDATE", - channel_id: message.channel_id, - data: message.toJSON(), - } satisfies MessageUpdateEvent), - Message.update({ id: message.id, channel_id: message.channel_id }, { embeds: embeds }), - ...cachePromises, - ]); + if ((await getPermission(message.author_id, message.channel.guild_id, message.channel_id)).has(Permissions.FLAGS.EMBED_LINKS)) await fillMessageUrlEmbeds(message); } export async function sendMessage(opts: MessageOptions) { diff --git a/src/api/util/utility/EmbedHandlers.ts b/src/api/util/utility/EmbedHandlers.ts index 77d0c8f0c..da6a74294 100644 --- a/src/api/util/utility/EmbedHandlers.ts +++ b/src/api/util/utility/EmbedHandlers.ts @@ -16,12 +16,13 @@ along with this program. If not, see . */ -import { Config } from "@spacebar/util"; +import { arrayDistinctBy, arrayGroupBy, arrayRemove, Config, EmbedCache, emitEvent, Message, MessageUpdateEvent, normalizeUrl } from "@spacebar/util"; import { Embed, EmbedImage, EmbedType } from "@spacebar/schemas"; import * as cheerio from "cheerio"; import crypto from "crypto"; import { yellow } from "picocolors"; import probe from "probe-image-size"; +import { FindOptionsWhere, In } from "typeorm"; export const DEFAULT_FETCH_OPTIONS: RequestInit = { redirect: "follow", @@ -517,3 +518,161 @@ export const EmbedHandlers: { }; }, }; + +const LINK_REGEX = /?/g; + +export function getMessageContentUrls(message: Message) { + const content = message.content?.replace(/ *`[^)]*` */g, ""); // remove markdown + + return content?.match(LINK_REGEX) ?? []; +} + +export async function dropDuplicateCacheEntries(entries: EmbedCache[]): Promise { + const grouped = Array.from(arrayGroupBy(entries, (e) => e.url).values()).map((g) => + g.toSorted((e1, e2) => { + let diff = e2.createdAt.getTime() - e1.createdAt.getTime(); + if (diff == 0) diff = Number(BigInt(e2.id) - BigInt(e1.id)); + return diff; + }), + ); + + const fullToDeleteIds: string[] = []; + for (const group of grouped) { + if (group.length <= 1) continue; + // console.log("[EmbedCache] Removing all but first from cache:", group); + // this might be backwards, sort always confuses me lol + const toDelete = group.slice(1); + const toDeleteIds = toDelete.map((x) => x.id); + fullToDeleteIds.push(...toDeleteIds); + console.warn("[EmbedCache] Removing duplicate IDs for", toDelete[0].url, " - ", toDeleteIds); + } + + await EmbedCache.delete({ id: In(fullToDeleteIds) } as FindOptionsWhere); + + // console.log("[EmbedCache] Cached embeds:", Array.from(grouped.map((x) => x[0].url))); + return Array.from(grouped.map((x) => x[0])); +} + +async function sleep(ms: number) { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +// hack to make nodejs not die +function getSlowdownFactor(off: number) { + if (off < 10) return off; + if (off < 25) return 100 + off * 2; + if (off < 50) return 200 + off * 10; + if (off < 100) return 500 + off * 15; + if (off < 250) return 750 + off * 20; + return 1000 + off * 150; +} + +export async function getOrUpdateEmbedCache(urls: string[], cb?: (url: string, embeds: Embed[]) => Promise): Promise { + urls = arrayDistinctBy(urls, (x) => x); + const embeds: EmbedCache[] = []; + + const cachedEmbeds = await dropDuplicateCacheEntries( + await EmbedCache.find({ + where: { + url: In(urls.map(normalizeUrl)), + }, + }), + ); + embeds.push(...cachedEmbeds); + cb?.( + "cached", + cachedEmbeds.map((e) => e.embed), + ); + + const urlsToGenerate = urls.filter((url) => { + return !cachedEmbeds.some((e) => e.url == normalizeUrl(url)); + }); + + if (urlsToGenerate.length > 0) console.log("[Embeds] Need to generate embeds for urls:", urlsToGenerate); + if (cachedEmbeds.length > 0) + console.log( + "[Embeds] Already had embeds for urls:", + cachedEmbeds.map((e) => e.url), + ); + + let off = 0; + const generatedEmbeds = await Promise.all( + urlsToGenerate.map(async (link) => { + await sleep(getSlowdownFactor(off++)); // ...or nodejs gets overwhelmed and times out + return await getOrUpdateEmbedCacheSingle(link, cb); + }), + ); + + embeds.push(...generatedEmbeds.filter((e): e is EmbedCache[] => e !== null).flat()); + + return embeds; +} + +async function getOrUpdateEmbedCacheSingle(link: string, cb?: (url: string, embeds: Embed[]) => Promise): Promise { + const url = new URL(link); + const handler = url.hostname === new URL(Config.get().cdn.endpointPublic!).hostname ? EmbedHandlers["self"] : (EmbedHandlers[url.hostname] ?? EmbedHandlers["default"]); + const results: EmbedCache[] = []; + try { + let res = await handler(url); + if (!res) return null; + if (!Array.isArray(res)) res = [res]; + + for (const embed of res) { + // Cache with normalized URL + const cache = await EmbedCache.create({ + url: normalizeUrl(url.href), + embed: embed, + createdAt: new Date(), + }).save(); + results.push(cache); + console.log("[Embeds] Generated embed for", link); + } + await cb?.(link, res); + } catch (e) { + console.error(`[Embeds] Error while generating embed for ${link}`, e); + } + return results.length == 0 ? null : results; +} + +export async function fillMessageUrlEmbeds(message: Message) { + const linkMatches = getMessageContentUrls(message).filter((l) => !l.startsWith("<") && !l.endsWith(">")); + + // Filter out embeds that could be links, start from scratch + message.embeds = message.embeds.filter((embed) => embed.type === "rich"); + + if (linkMatches.length == 0) return message; + + const uniqueLinks: string[] = arrayDistinctBy(linkMatches, normalizeUrl); + + if (uniqueLinks.length === 0) { + // No valid unique links found, update message to remove old embeds + message.embeds = message.embeds?.filter((embed) => embed.type === "rich"); + await saveAndEmitMessageUpdate(message); + return message; + } + + // avoid a race condition updating the same row + let messageUpdateLock = saveAndEmitMessageUpdate(message); + await getOrUpdateEmbedCache(uniqueLinks, async (_, embeds) => { + if (message.embeds.length + embeds.length > Config.get().limits.message.maxEmbeds) return; + message.embeds.push(...embeds); + try { + await messageUpdateLock; + } catch { + /* empty */ + } + messageUpdateLock = saveAndEmitMessageUpdate(message); + }); + + await saveAndEmitMessageUpdate(message); + return message; +} + +async function saveAndEmitMessageUpdate(message: Message) { + await Message.update({ id: message.id, channel_id: message.channel_id }, { embeds: message.embeds }); + await emitEvent({ + event: "MESSAGE_UPDATE", + channel_id: message.channel_id, + data: message.toJSON(), + } satisfies MessageUpdateEvent); +} diff --git a/src/util/config/types/subconfigurations/limits/MessageLimits.ts b/src/util/config/types/subconfigurations/limits/MessageLimits.ts index f61c0b651..50ea32d22 100644 --- a/src/util/config/types/subconfigurations/limits/MessageLimits.ts +++ b/src/util/config/types/subconfigurations/limits/MessageLimits.ts @@ -24,4 +24,5 @@ export class MessageLimits { maxBulkDelete: number = 1000; maxEmbedDownloadSize: number = 1024 * 1024 * 5; maxPreloadCount: number = 100; + maxEmbeds: number = 20; } diff --git a/src/util/entities/EmbedCache.ts b/src/util/entities/EmbedCache.ts index db1fbba67..a9ffd063c 100644 --- a/src/util/entities/EmbedCache.ts +++ b/src/util/entities/EmbedCache.ts @@ -29,4 +29,11 @@ export class EmbedCache extends BaseClass { @Column({ type: "simple-json" }) embed: Embed; + + // TODO: store all returned embed objects from a handler + // @Column({ type: "simple-json" }) + // embeds: Embed[]; + + @Column({ name: "created_at", type: "timestamp with time zone" }) + createdAt: Date; } diff --git a/src/util/migration/postgres/1772404321402-EmbedCacheCreatedAt.ts b/src/util/migration/postgres/1772404321402-EmbedCacheCreatedAt.ts new file mode 100644 index 000000000..bf845ff74 --- /dev/null +++ b/src/util/migration/postgres/1772404321402-EmbedCacheCreatedAt.ts @@ -0,0 +1,13 @@ +import { MigrationInterface, QueryRunner } from "typeorm"; + +export class EmbedCacheCreatedAt1772404321402 implements MigrationInterface { + name = "EmbedCacheCreatedAt1772404321402"; + + public async up(queryRunner: QueryRunner): Promise { + await queryRunner.query(`ALTER TABLE "embed_cache" ADD "created_at" timestamp with time zone DEFAULT now();`); + } + + public async down(queryRunner: QueryRunner): Promise { + await queryRunner.query(`ALTER TABLE "embed_cache" DROP COLUMN "created_at"`); + } +} diff --git a/src/util/util/extensions/Array.ts b/src/util/util/extensions/Array.ts index bb5f3362a..61cf469b2 100644 --- a/src/util/util/extensions/Array.ts +++ b/src/util/util/extensions/Array.ts @@ -42,3 +42,16 @@ export function arrayDistinctBy(array: T[], selector: (elem: T) => M): T[] return true; }); } + +export function arrayGroupBy(array: T[], selector: (elem: T) => M): Map { + const map = new Map(); + + array.forEach((item) => { + const mappedValue = selector(item); + const existing = map.get(mappedValue); + if (existing) existing.push(item); + else map.set(mappedValue, [item]); + }); + + return map; +}