@atcute/bluesky-richtext-parser#

tokenizer for parsing Bluesky rich text syntax.

npm install @atcute/bluesky-richtext-parser

parses user input text into tokens for mentions, hashtags, links, and text formatting. supports Bluesky's standard syntax plus Markdown-style formatting extensions.

usage#

basic parsing#

import { tokenize } from '@atcute/bluesky-richtext-parser';

const tokens = tokenize('hello @alice.bsky.social! check out #atproto');

// [
//   { type: 'text', raw: 'hello ', content: 'hello ' },
//   { type: 'mention', raw: '@alice.bsky.social', handle: 'alice.bsky.social' },
//   { type: 'text', raw: '! check out ', content: '! check out ' },
//   { type: 'topic', raw: '#atproto', name: 'atproto' }
// ]

supported syntax#

mentions#

tokenize('@alice.bsky.social');
// -> [{ type: 'mention', handle: 'alice.bsky.social' }]

tokenize('＠alice.bsky.social'); // fullwidth @ also works
// -> [{ type: 'mention', handle: 'alice.bsky.social' }]

hashtags (topics)#

tokenize('#atproto');
// -> [{ type: 'topic', name: 'atproto' }]

tokenize('＃atproto'); // fullwidth # also works
// -> [{ type: 'topic', name: 'atproto' }]

auto-linked URLs#

bare URLs are automatically detected:

tokenize('check out https://example.com');
// -> [
//   { type: 'text', content: 'check out ' },
//   { type: 'autolink', url: 'https://example.com' }
// ]

markdown links#

tokenize('[my website](https://example.com)');
// -> [{ type: 'link', url: 'https://example.com', children: [{ type: 'text', content: 'my website' }] }]

link text can contain nested formatting:

tokenize('[**bold link**](https://example.com)');
// -> [{ type: 'link', children: [{ type: 'strong', ... }] }]

text formatting#

// bold
tokenize('**bold text**');
// -> [{ type: 'strong', children: [{ type: 'text', content: 'bold text' }] }]

// italic
tokenize('*italic text*');
// -> [{ type: 'emphasis', children: [...] }]

tokenize('_also italic_');
// -> [{ type: 'emphasis', children: [...] }]

// underline
tokenize('__underlined__');
// -> [{ type: 'underline', children: [...] }]

// strikethrough
tokenize('~~deleted~~');
// -> [{ type: 'delete', children: [...] }]

// inline code
tokenize('use `npm install`');
// -> [{ type: 'text', ... }, { type: 'code', content: 'npm install' }]

emotes#

tokenize('hello :wave:');
// -> [{ type: 'text', ... }, { type: 'emote', name: 'wave' }]

escapes#

backslash escapes special characters:

tokenize('not a \\@mention');
// -> [{ type: 'text', ... }, { type: 'escape', escaped: '@' }, { type: 'text', ... }]

handling tokens#

process tokens to build facets or render content:

import { tokenize, type Token } from '@atcute/bluesky-richtext-parser';
import RichtextBuilder from '@atcute/bluesky-richtext-builder';

/**
 * resolves a handle to its DID.
 *
 * placeholder: swap the body for your own handle resolution. as written it
 * reports every handle as unresolvable.
 *
 * @param handle handle without the leading `@`, e.g. `alice.bsky.social`
 * @returns the DID for the handle, or `null` when it cannot be resolved
 */
const resolveHandle = async (handle: string): Promise<string | null> => {
	// resolve handle to DID
	// explicit `null` keeps the stub consistent with its declared return type
	return null;
};

/**
 * converts parsed tokens into a `RichtextBuilder`, adding one piece of text
 * (plain or decorated) per top-level token, in order.
 *
 * mentions are resolved one at a time through `resolveHandle`; a mention whose
 * handle does not resolve is added as plain text instead.
 *
 * @param tokens tokens as returned by `tokenize`
 * @returns the builder holding everything that was added
 */
const processTokens = async (tokens: Token[]): Promise<RichtextBuilder> => {
	const rt = new RichtextBuilder();

	for (const token of tokens) {
		switch (token.type) {
			case 'text':
				rt.addText(token.content);
				break;

			case 'mention': {
				const did = await resolveHandle(token.handle);
				if (did) {
					rt.addMention(token.raw, did);
				} else {
					// unresolvable handle: keep the text the user typed
					rt.addText(token.raw);
				}
				break;
			}

			case 'topic':
				rt.addTag(token.name);
				break;

			case 'autolink':
				rt.addLink(token.url, token.url);
				break;

			// braces keep `text` scoped to this clause rather than the whole switch
			case 'link': {
				// flatten children to text
				const text = flattenToText(token.children);
				rt.addLink(text, token.url);
				break;
			}

			case 'escape':
				rt.addText(token.escaped);
				break;

			// formatting tokens (strong, emphasis, etc.) don't map to facets
			// so just extract their text content
			case 'strong':
			case 'emphasis':
			case 'underline':
			case 'delete':
				rt.addText(flattenToText(token.children));
				break;

			case 'code':
				rt.addText(token.content);
				break;

			case 'emote':
				// handle emotes as needed
				rt.addText(token.raw);
				break;
		}
	}

	return rt;
};

/**
 * collapses a token tree into the plain text it represents.
 *
 * @param tokens tokens to flatten, e.g. the `children` of a link or formatting token
 * @returns the concatenated text of every token, in order
 */
const flattenToText = (tokens: Token[]): string => {
	let text = '';

	for (const token of tokens) {
		if (token.type === 'escape') {
			// `raw` still carries the backslash; `escaped` is the character itself
			text += token.escaped;
		} else if ('content' in token) {
			text += token.content;
		} else if ('children' in token) {
			text += flattenToText(token.children);
		} else {
			// no text payload of its own: fall back to the matched source text
			text += token.raw;
		}
	}

	return text;
};

token types#

type	fields	description
`text`	`content`	plain text
`mention`	`handle`	@mention
`topic`	`name`	#hashtag
`emote`	`name`	:emote:
`autolink`	`url`	bare URL
`link`	`url`, `children`	markdown link with nested tokens
`strong`	`children`	**bold**
`emphasis`	`children`	_italic_
`underline`	`children`	__underline__
`delete`	`children`	~~strikethrough~~
`code`	`content`	`inline code`
`escape`	`escaped`	backslash escape

all tokens also have a `raw` field containing the original matched text.