Embeds: factor out from postMessage

This commit is contained in:
Rory&
2026-03-26 11:46:48 +01:00
parent 47e782ad62
commit daaf6eae28
6 changed files with 202 additions and 134 deletions

View File

@@ -16,7 +16,7 @@
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
import { EmbedHandlers, randomString } from "@spacebar/api";
import { EmbedHandlers, randomString, fillMessageUrlEmbeds } from "@spacebar/api";
import {
Application,
Attachment,
@@ -69,13 +69,12 @@ import {
UnfurledMediaItem,
BaseMessageComponents,
v1CompTypes,
PartialUser
PartialUser,
} from "@spacebar/schemas";
const allow_empty = false;
// TODO: check webhook, application, system author, stickers
// TODO: embed gifs/videos/images
const LINK_REGEX = /<?https?:\/\/(www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&/=]*)>?/g;
function checkActionRow(row: ActionRowComponent, knownComponentIds: string[], errors: Record<string, { code?: string; message: string }>, rowIndex: number) {
if (!row.components) {
return;
@@ -774,139 +773,15 @@ export async function handleMessage(opts: MessageOptions): Promise<Message> {
// TODO: cache link result in db
export async function postHandleMessage(message: Message) {
const conf = Config.get();
const content = message.content?.replace(/ *`[^)]*` */g, ""); // remove markdown
const linkMatches = content?.match(LINK_REGEX) || [];
message.clean_data();
const data = { ...message.toJSON() };
const currentNormalizedUrls = new Set<string>();
for (const link of linkMatches) {
// Don't process links in <>
if (link.startsWith("<") && link.endsWith(">")) {
continue;
}
try {
const normalized = normalizeUrl(link);
currentNormalizedUrls.add(normalized);
} catch (e) {
/* empty */
}
}
if (data.embeds != undefined) {
data.embeds?.forEach((embed) => {
if (!embed.type) {
embed.type = EmbedType.rich;
}
});
}
// Filter out embeds that could be links, start from scratch
if (data.embeds != undefined) {
data.embeds = data.embeds?.filter((embed) => embed.type === "rich");
}
message.embeds ??= [];
message.embeds.forEach((embed) => {
// we need to handle false-y values (empty string) here, so cant use ??=
embed.type ||= EmbedType.rich;
});
const seenNormalizedUrls = new Set<string>();
const uniqueLinks: string[] = [];
for (const link of linkMatches.slice(0, 20)) {
// embed max 20 links - TODO: make this configurable with instance policies
// Don't embed links in <>
if (link.startsWith("<") && link.endsWith(">")) continue;
try {
const normalized = normalizeUrl(link);
if (!seenNormalizedUrls.has(normalized)) {
seenNormalizedUrls.add(normalized);
uniqueLinks.push(link);
}
} catch (e) {
// Invalid URL, skip
}
}
if (uniqueLinks.length === 0) {
// No valid unique links found, update message to remove old embeds
if (data.embeds != undefined) {
data.embeds = data.embeds?.filter((embed) => embed.type === "rich");
}
// author value is already included in message.toJSON()
const event = {
event: "MESSAGE_UPDATE",
channel_id: message.channel_id,
data: {
...message.toJSON(),
embeds: data.embeds == undefined ? message.embeds || [] : data.embeds,
},
} satisfies MessageUpdateEvent;
const embeds = data.embeds == undefined ? [] : data.embeds;
await Promise.all([emitEvent(event), Message.update({ id: message.id, channel_id: message.channel_id }, { embeds: embeds })]);
return;
}
const cachePromises = [];
for (const link of uniqueLinks) {
let url: URL;
try {
url = new URL(link);
} catch (e) {
// Skip invalid URLs
continue;
}
const normalizedUrl = normalizeUrl(link);
// Check cache using normalized URL
const cached = await EmbedCache.findOne({
where: { url: normalizedUrl },
});
if (cached) {
if (data.embeds == undefined) {
data.embeds = [];
}
data.embeds?.push(cached.embed);
continue;
}
// bit gross, but whatever!
const endpointPublic = conf.cdn.endpointPublic; // lol
const handler = url.hostname === new URL(endpointPublic!).hostname ? EmbedHandlers["self"] : EmbedHandlers[url.hostname] || EmbedHandlers["default"];
try {
let res = await handler(url);
if (!res) continue;
// tried to use shorthand but types didn't like me L
if (!Array.isArray(res)) res = [res];
for (const embed of res) {
// Cache with normalized URL
const cache = EmbedCache.create({
url: normalizedUrl,
embed: embed,
});
cachePromises.push(cache.save());
if (data.embeds == undefined) {
data.embeds = [];
}
data.embeds?.push(embed);
}
} catch (e) {
console.error(`[Embeds] Error while generating embed for ${link}`, e);
}
}
const embeds = data.embeds == undefined ? [] : data.embeds;
await Promise.all([
emitEvent({
event: "MESSAGE_UPDATE",
channel_id: message.channel_id,
data: message.toJSON(),
} satisfies MessageUpdateEvent),
Message.update({ id: message.id, channel_id: message.channel_id }, { embeds: embeds }),
...cachePromises,
]);
if ((await getPermission(message.author_id, message.channel.guild_id, message.channel_id)).has(Permissions.FLAGS.EMBED_LINKS)) await fillMessageUrlEmbeds(message);
}
export async function sendMessage(opts: MessageOptions) {

View File

@@ -16,12 +16,13 @@
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
import { Config } from "@spacebar/util";
import { arrayDistinctBy, arrayGroupBy, arrayRemove, Config, EmbedCache, emitEvent, Message, MessageUpdateEvent, normalizeUrl } from "@spacebar/util";
import { Embed, EmbedImage, EmbedType } from "@spacebar/schemas";
import * as cheerio from "cheerio";
import crypto from "crypto";
import { yellow } from "picocolors";
import probe from "probe-image-size";
import { FindOptionsWhere, In } from "typeorm";
export const DEFAULT_FETCH_OPTIONS: RequestInit = {
redirect: "follow",
@@ -517,3 +518,161 @@ export const EmbedHandlers: {
};
},
};
const LINK_REGEX = /<?https?:\/\/(www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&/=]*)>?/g;

/**
 * Extracts every http(s) url from a message's content.
 * Inline code spans (`...`) are stripped first so links inside backticks are
 * never picked up. Returns an empty array when the message has no content.
 */
export function getMessageContentUrls(message: Message) {
	// Strip markdown code spans before matching.
	const withoutCodeSpans = message.content?.replace(/ *`[^)]*` */g, "");
	const matches = withoutCodeSpans?.match(LINK_REGEX);
	return matches ?? [];
}
/**
 * Collapses duplicate EmbedCache rows sharing the same url: keeps the newest
 * entry per url (ties broken by the larger snowflake id) and deletes the rest
 * from the database.
 * @param entries cache rows fetched for a set of urls (may contain duplicates)
 * @returns exactly one (newest) cache entry per distinct url
 */
export async function dropDuplicateCacheEntries(entries: EmbedCache[]): Promise<EmbedCache[]> {
	// Group by url; inside each group sort newest-first (createdAt descending,
	// then id descending), so group[0] is always the entry we keep.
	const grouped = Array.from(arrayGroupBy(entries, (e) => e.url).values()).map((g) =>
		g.toSorted((e1, e2) => {
			let diff = e2.createdAt.getTime() - e1.createdAt.getTime();
			if (diff == 0) diff = Number(BigInt(e2.id) - BigInt(e1.id));
			return diff;
		}),
	);
	const fullToDeleteIds: string[] = [];
	for (const group of grouped) {
		if (group.length <= 1) continue;
		// Everything after the newest entry is a duplicate to be removed.
		const toDelete = group.slice(1);
		const toDeleteIds = toDelete.map((x) => x.id);
		fullToDeleteIds.push(...toDeleteIds);
		console.warn("[EmbedCache] Removing duplicate IDs for", toDelete[0].url, " - ", toDeleteIds);
	}
	// Only hit the database when there is actually something to delete;
	// previously this issued an unconditional DELETE ... WHERE id IN () on
	// every cache lookup, even with no duplicates.
	if (fullToDeleteIds.length > 0) {
		await EmbedCache.delete({ id: In(fullToDeleteIds) } as FindOptionsWhere<EmbedCache>);
	}
	return grouped.map((x) => x[0]);
}
/** Resolves after roughly `ms` milliseconds (promisified setTimeout). */
async function sleep(ms: number) {
	return new Promise<void>((done) => {
		setTimeout(done, ms);
	});
}
// hack to make nodejs not die
/**
 * Progressive backoff for batched embed generation: `off` is how many
 * requests were already dispatched; the further into the batch, the longer
 * the delay (in ms) before launching the next request.
 */
function getSlowdownFactor(off: number) {
	// [upperBound, base, perRequest] tiers; the first matching bound wins.
	const tiers: Array<[number, number, number]> = [
		[10, 0, 1],
		[25, 100, 2],
		[50, 200, 10],
		[100, 500, 15],
		[250, 750, 20],
	];
	for (const [bound, base, perRequest] of tiers) {
		if (off < bound) return base + off * perRequest;
	}
	return 1000 + off * 150;
}
/**
 * Resolves embeds for a set of urls: serves cached entries from EmbedCache
 * where possible and generates (then caches) the rest via the embed handlers.
 * @param urls candidate urls (deduplicated internally)
 * @param cb invoked once with ("cached", embeds) for all cache hits, then once
 *           per generated url with its embeds
 * @returns all cache entries (cached + freshly generated)
 */
export async function getOrUpdateEmbedCache(urls: string[], cb?: (url: string, embeds: Embed[]) => Promise<void>): Promise<EmbedCache[]> {
	urls = arrayDistinctBy(urls, (x) => x);
	const embeds: EmbedCache[] = [];
	// Cache rows are keyed by normalized url; prune duplicate rows on read.
	const cachedEmbeds = await dropDuplicateCacheEntries(
		await EmbedCache.find({
			where: {
				url: In(urls.map(normalizeUrl)),
			},
		}),
	);
	embeds.push(...cachedEmbeds);
	// Await the callback: previously this promise was dropped, so a rejecting
	// callback surfaced as an unhandled rejection.
	await cb?.(
		"cached",
		cachedEmbeds.map((e) => e.embed),
	);
	// Hoisted set of cached (already-normalized) urls — avoids re-running
	// normalizeUrl over all cached entries for every candidate url.
	const cachedUrls = new Set(cachedEmbeds.map((e) => e.url));
	const urlsToGenerate = urls.filter((url) => !cachedUrls.has(normalizeUrl(url)));
	if (urlsToGenerate.length > 0) console.log("[Embeds] Need to generate embeds for urls:", urlsToGenerate);
	if (cachedEmbeds.length > 0)
		console.log(
			"[Embeds] Already had embeds for urls:",
			cachedEmbeds.map((e) => e.url),
		);
	let off = 0;
	const generatedEmbeds = await Promise.all(
		urlsToGenerate.map(async (link) => {
			await sleep(getSlowdownFactor(off++)); // ...or nodejs gets overwhelmed and times out
			return await getOrUpdateEmbedCacheSingle(link, cb);
		}),
	);
	embeds.push(...generatedEmbeds.filter((e): e is EmbedCache[] => e !== null).flat());
	return embeds;
}
/**
 * Generates embeds for one link via the matching EmbedHandler, persists each
 * result to EmbedCache (keyed by normalized url), and invokes `cb` with the
 * generated embeds.
 * @returns the saved cache entries, or null when nothing could be generated
 */
async function getOrUpdateEmbedCacheSingle(link: string, cb?: (url: string, embeds: Embed[]) => Promise<void>): Promise<EmbedCache[] | null> {
	let url: URL;
	try {
		url = new URL(link);
	} catch {
		// Malformed url: previously `new URL(link)` threw synchronously here,
		// rejecting the caller's whole Promise.all batch for one bad link.
		return null;
	}
	// Links pointing at our own CDN use the "self" handler; otherwise pick a
	// host-specific handler and fall back to the generic one.
	const handler = url.hostname === new URL(Config.get().cdn.endpointPublic!).hostname ? EmbedHandlers["self"] : (EmbedHandlers[url.hostname] ?? EmbedHandlers["default"]);
	const results: EmbedCache[] = [];
	try {
		let res = await handler(url);
		if (!res) return null;
		if (!Array.isArray(res)) res = [res];
		for (const embed of res) {
			// Cache with normalized URL
			const cache = await EmbedCache.create({
				url: normalizeUrl(url.href),
				embed: embed,
				createdAt: new Date(),
			}).save();
			results.push(cache);
			console.log("[Embeds] Generated embed for", link);
		}
		await cb?.(link, res);
	} catch (e) {
		// Handler/network/db failures are best-effort: log and fall through so
		// partial results (if any) are still returned.
		console.error(`[Embeds] Error while generating embed for ${link}`, e);
	}
	return results.length == 0 ? null : results;
}
/**
 * Regenerates the url-preview embeds for a message: extracts links from the
 * content, drops previously generated (non-rich) embeds, then fills embeds in
 * from the cache / handlers, persisting and emitting MESSAGE_UPDATE as
 * results arrive.
 * @returns the (mutated) message
 */
export async function fillMessageUrlEmbeds(message: Message) {
	// Suppress only links fully wrapped in <angle brackets>. The previous
	// `!l.startsWith("<") && !l.endsWith(">")` check (De Morgan error) wrongly
	// dropped any link that merely started with "<" OR ended with ">", both of
	// which LINK_REGEX can match on their own.
	const linkMatches = getMessageContentUrls(message).filter((l) => !(l.startsWith("<") && l.endsWith(">")));
	// Filter out embeds that could be links, start from scratch
	message.embeds = message.embeds.filter((embed) => embed.type === "rich");
	if (linkMatches.length == 0) return message;
	const uniqueLinks: string[] = arrayDistinctBy(linkMatches, normalizeUrl);
	if (uniqueLinks.length === 0) {
		// No valid unique links found, update message to remove old embeds
		message.embeds = message.embeds?.filter((embed) => embed.type === "rich");
		await saveAndEmitMessageUpdate(message);
		return message;
	}
	// avoid a race condition updating the same row
	let messageUpdateLock = saveAndEmitMessageUpdate(message);
	await getOrUpdateEmbedCache(uniqueLinks, async (_, embeds) => {
		// Respect the instance's per-message embed cap; skip whole batches
		// that would push us over it.
		if (message.embeds.length + embeds.length > Config.get().limits.message.maxEmbeds) return;
		message.embeds.push(...embeds);
		// Wait for the previous row update to land before issuing the next so
		// concurrent writes to the same message row cannot interleave.
		try {
			await messageUpdateLock;
		} catch {
			/* empty */
		}
		messageUpdateLock = saveAndEmitMessageUpdate(message);
	});
	// Final flush: persist and broadcast whatever state we ended up with.
	await saveAndEmitMessageUpdate(message);
	return message;
}
/**
 * Persists the message's current embeds to the database, then broadcasts the
 * updated message to its channel via MESSAGE_UPDATE.
 */
async function saveAndEmitMessageUpdate(message: Message) {
	const { id, channel_id } = message;
	// Write first so gateway consumers never see state the db doesn't have.
	await Message.update({ id, channel_id }, { embeds: message.embeds });
	const event = {
		event: "MESSAGE_UPDATE",
		channel_id,
		data: message.toJSON(),
	} satisfies MessageUpdateEvent;
	await emitEvent(event);
}

View File

@@ -24,4 +24,5 @@ export class MessageLimits {
maxBulkDelete: number = 1000;
maxEmbedDownloadSize: number = 1024 * 1024 * 5;
maxPreloadCount: number = 100;
maxEmbeds: number = 20;
}

View File

@@ -29,4 +29,11 @@ export class EmbedCache extends BaseClass {
@Column({ type: "simple-json" })
embed: Embed;
// TODO: store all returned embed objects from a handler
// @Column({ type: "simple-json" })
// embeds: Embed[];
@Column({ name: "created_at", type: "timestamp with time zone" })
createdAt: Date;
}

View File

@@ -0,0 +1,13 @@
import { MigrationInterface, QueryRunner } from "typeorm";
/** Adds embed_cache.created_at, used to pick the newest cache entry per url. */
export class EmbedCacheCreatedAt1772404321402 implements MigrationInterface {
	name = "EmbedCacheCreatedAt1772404321402";

	public async up(queryRunner: QueryRunner): Promise<void> {
		// DEFAULT now() backfills existing rows as the column is added.
		const addCreatedAt = `ALTER TABLE "embed_cache" ADD "created_at" timestamp with time zone DEFAULT now();`;
		await queryRunner.query(addCreatedAt);
	}

	public async down(queryRunner: QueryRunner): Promise<void> {
		const dropCreatedAt = `ALTER TABLE "embed_cache" DROP COLUMN "created_at"`;
		await queryRunner.query(dropCreatedAt);
	}
}

View File

@@ -42,3 +42,16 @@ export function arrayDistinctBy<T, M>(array: T[], selector: (elem: T) => M): T[]
return true;
});
}
/**
 * Partitions `array` into buckets keyed by `selector`, preserving the
 * original element order within each bucket.
 */
export function arrayGroupBy<T, M>(array: T[], selector: (elem: T) => M): Map<M, T[]> {
	const buckets = new Map<M, T[]>();
	for (const element of array) {
		const key = selector(element);
		const bucket = buckets.get(key);
		if (bucket) bucket.push(element);
		else buckets.set(key, [element]);
	}
	return buckets;
}