// Markdown -> Semble importer
// Isolated from the Semble repo to keep the Citoid mapping auditable and easy to update.
2
/**
 * One creator entry attached to a Citoid item.
 *
 * All fields are optional. When formatting a creator this module prefers
 * `name` and otherwise combines `firstName` and `lastName`.
 */
export interface CitoidCreator {
  /** Given name; paired with `lastName` when `name` is not set. */
  firstName?: string;

  /** Family name; also used on its own when no first name is available. */
  lastName?: string;

  /**
   * Role of the creator (for example "author" or "editor"). It decides which
   * creators are reported as the authors of an item.
   */
  creatorType?: string;

  /** Complete display name; takes precedence over the split name fields. */
  name?: string;
}
9
/**
 * The subset of a Citoid item that this module reads.
 *
 * Every field is optional; which ones are consulted depends on `itemType`.
 */
export interface CitoidResponse {
  /** Item type string; selects the Semble type, the site-name source and the date field. */
  itemType?: string;
  /** Creators of the item; reduced to a single author string. */
  creators?: CitoidCreator[];
  /** Copied to the metadata `title`. */
  title?: string;
  /** Default source for the published date. */
  date?: string;
  /** Copied to the metadata `description`. */
  abstractNote?: string;

  // --- Site-name candidates (which one is used depends on `itemType`) ---

  /** Journal, magazine and newspaper articles; first choice of the generic fallback. */
  publicationTitle?: string;
  /** Web pages, and blog posts lacking `blogTitle`. */
  websiteTitle?: string;
  /** Blog posts. */
  blogTitle?: string;
  /** Forum posts. */
  forumTitle?: string;
  /** Datasets and preprints. */
  repository?: string;
  /** Only consulted by the generic fallback. */
  libraryCatalog?: string;
  /** Reports, and a fallback for theses, datasets and preprints. */
  institution?: string;
  /** Theses. */
  university?: string;
  /** Books, and a fallback for several other item types. */
  publisher?: string;
  /** Conference papers (first choice). */
  proceedingsTitle?: string;
  /** Conference papers (second choice). */
  conferenceName?: string;
  /** Books and book sections lacking `publisher`. */
  series?: string;
  /** Podcasts. */
  seriesTitle?: string;
  /** TV and radio broadcasts, and podcasts lacking `seriesTitle`. */
  network?: string;
  /** TV and radio broadcasts lacking `network`. */
  programTitle?: string;
  /** Films and video recordings (first choice). */
  distributor?: string;
  /** Films and video recordings (second choice). */
  studio?: string;
  /** Audio recordings. */
  label?: string;

  // --- Identifiers (copied through unchanged) ---

  /** Copied to the metadata `doi`. */
  DOI?: string;
  /** Copied to the metadata `isbn`. */
  ISBN?: string;

  // --- Type-specific dates (preferred over `date` for the noted item type) ---

  /** Cases. */
  dateDecided?: string;
  /** Statutes. */
  dateEnacted?: string;
  /** Patents (first choice). */
  issueDate?: string;
  /** Patents (second choice). */
  filingDate?: string;
}
41
/**
 * Values allowed for `SembleUrlMetadata.type`.
 *
 * `LINK` is what this module falls back to when a Citoid item type is
 * missing or has no more specific mapping.
 */
export enum SembleUrlType {
  LINK = "link",
  ARTICLE = "article",
  BOOK = "book",
  RESEARCH = "research",
  AUDIO = "audio",
  VIDEO = "video",
  SOCIAL = "social",
  SOFTWARE = "software",
}
52
/**
 * Metadata record produced for a URL by `buildSembleMetadataFromCitoid`.
 *
 * Every field is optional because the source item may lack any of them.
 */
export interface SembleUrlMetadata {
  /** Title of the item. */
  title?: string;
  /** Taken from the item's `abstractNote`. */
  description?: string;
  /** Creator names joined with ", ". */
  author?: string;
  /** Parsed publication date; absent when no date could be parsed. */
  publishedDate?: Date;
  /** Name of the publication, site or institution the item belongs to. */
  siteName?: string;
  /** Category derived from the item's `itemType`. */
  type?: SembleUrlType;
  /** Taken from the item's `DOI`. */
  doi?: string;
  /** Taken from the item's `ISBN`. */
  isbn?: string;
}
63
/**
 * Item type strings this module recognises in `CitoidResponse.itemType`.
 *
 * Each member has exactly one entry in the `citoidToSembleType` table.
 */
export enum CitoidUrlTypes {
  ARTWORK = "artwork",
  AUDIO_RECORDING = "audioRecording",
  BILL = "bill",
  BLOG_POST = "blogPost",
  BOOK = "book",
  BOOK_SECTION = "bookSection",
  CASE = "case",
  CONFERENCE_PAPER = "conferencePaper",
  DATASET = "dataset",
  DICTIONARY_ENTRY = "dictionaryEntry",
  DOCUMENT = "document",
  EMAIL = "email",
  ENCYCLOPEDIA_ARTICLE = "encyclopediaArticle",
  FILM = "film",
  FORUM_POST = "forumPost",
  HEARING = "hearing",
  INSTANT_MESSAGE = "instantMessage",
  INTERVIEW = "interview",
  JOURNAL_ARTICLE = "journalArticle",
  LETTER = "letter",
  MAGAZINE_ARTICLE = "magazineArticle",
  MANUSCRIPT = "manuscript",
  MAP = "map",
  NEWSPAPER_ARTICLE = "newspaperArticle",
  NOTE = "note",
  PATENT = "patent",
  PODCAST = "podcast",
  PREPRINT = "preprint",
  PRESENTATION = "presentation",
  RADIO_BROADCAST = "radioBroadcast",
  REPORT = "report",
  COMPUTER_PROGRAM = "computerProgram",
  STANDARD = "standard",
  STATUTE = "statute",
  THESIS = "thesis",
  TV_BROADCAST = "tvBroadcast",
  VIDEO_RECORDING = "videoRecording",
  WEBPAGE = "webpage",
}
104
/**
 * Lookup table from every Citoid item type to the Semble URL type it is
 * reported as. The `Record` annotation makes the compiler reject the table
 * if any `CitoidUrlTypes` member is left without an entry.
 */
const citoidToSembleType: Record<CitoidUrlTypes, SembleUrlType> = {
  [CitoidUrlTypes.ARTWORK]: SembleUrlType.LINK,
  [CitoidUrlTypes.AUDIO_RECORDING]: SembleUrlType.AUDIO,
  [CitoidUrlTypes.BILL]: SembleUrlType.LINK,
  [CitoidUrlTypes.BLOG_POST]: SembleUrlType.ARTICLE,
  [CitoidUrlTypes.BOOK]: SembleUrlType.BOOK,
  [CitoidUrlTypes.BOOK_SECTION]: SembleUrlType.BOOK,
  [CitoidUrlTypes.CASE]: SembleUrlType.LINK,
  [CitoidUrlTypes.CONFERENCE_PAPER]: SembleUrlType.RESEARCH,
  [CitoidUrlTypes.DATASET]: SembleUrlType.RESEARCH,
  [CitoidUrlTypes.DICTIONARY_ENTRY]: SembleUrlType.LINK,
  [CitoidUrlTypes.DOCUMENT]: SembleUrlType.LINK,
  [CitoidUrlTypes.EMAIL]: SembleUrlType.LINK,
  [CitoidUrlTypes.ENCYCLOPEDIA_ARTICLE]: SembleUrlType.ARTICLE,
  [CitoidUrlTypes.FILM]: SembleUrlType.VIDEO,
  [CitoidUrlTypes.FORUM_POST]: SembleUrlType.SOCIAL,
  [CitoidUrlTypes.HEARING]: SembleUrlType.LINK,
  [CitoidUrlTypes.INSTANT_MESSAGE]: SembleUrlType.LINK,
  [CitoidUrlTypes.INTERVIEW]: SembleUrlType.LINK,
  [CitoidUrlTypes.JOURNAL_ARTICLE]: SembleUrlType.RESEARCH,
  [CitoidUrlTypes.LETTER]: SembleUrlType.LINK,
  [CitoidUrlTypes.MAGAZINE_ARTICLE]: SembleUrlType.ARTICLE,
  [CitoidUrlTypes.MANUSCRIPT]: SembleUrlType.RESEARCH,
  [CitoidUrlTypes.MAP]: SembleUrlType.LINK,
  [CitoidUrlTypes.NEWSPAPER_ARTICLE]: SembleUrlType.ARTICLE,
  [CitoidUrlTypes.NOTE]: SembleUrlType.LINK,
  [CitoidUrlTypes.PATENT]: SembleUrlType.LINK,
  [CitoidUrlTypes.PODCAST]: SembleUrlType.AUDIO,
  [CitoidUrlTypes.PREPRINT]: SembleUrlType.RESEARCH,
  [CitoidUrlTypes.PRESENTATION]: SembleUrlType.RESEARCH,
  [CitoidUrlTypes.RADIO_BROADCAST]: SembleUrlType.AUDIO,
  [CitoidUrlTypes.REPORT]: SembleUrlType.RESEARCH,
  [CitoidUrlTypes.COMPUTER_PROGRAM]: SembleUrlType.SOFTWARE,
  [CitoidUrlTypes.STANDARD]: SembleUrlType.LINK,
  [CitoidUrlTypes.STATUTE]: SembleUrlType.LINK,
  [CitoidUrlTypes.THESIS]: SembleUrlType.RESEARCH,
  [CitoidUrlTypes.TV_BROADCAST]: SembleUrlType.VIDEO,
  [CitoidUrlTypes.VIDEO_RECORDING]: SembleUrlType.VIDEO,
  [CitoidUrlTypes.WEBPAGE]: SembleUrlType.LINK,
};
145
/**
 * Translates a Citoid `itemType` string into the Semble URL type.
 *
 * @param citoidType - Raw item type from a Citoid response; may be absent.
 * @returns The mapped type, or `SembleUrlType.LINK` when the input is empty
 *   or is not one of the known Citoid item types.
 */
export function mapCitoidUrlType(citoidType?: string): SembleUrlType {
  if (!citoidType) return SembleUrlType.LINK;

  // The input is an arbitrary string, so only the table's own keys may match.
  // A bare index lookup would also resolve inherited names such as
  // "constructor" or "toString" to truthy values that are not SembleUrlType.
  const isKnownType = Object.prototype.hasOwnProperty.call(citoidToSembleType, citoidType);
  if (!isKnownType) return SembleUrlType.LINK;

  // Safe to narrow: the own-property check above proved the key exists.
  return citoidToSembleType[citoidType as CitoidUrlTypes];
}
151
/**
 * Converts one Citoid item into the Semble metadata record.
 *
 * Plain fields are copied across; author, published date, site name and type
 * are derived by the helper functions in this module.
 *
 * @param data - The Citoid item to convert.
 * @returns A metadata record; any field may be `undefined` when the item
 *   carries no usable value for it.
 */
export function buildSembleMetadataFromCitoid(data: CitoidResponse): SembleUrlMetadata {
  const { title, abstractNote, creators, itemType, DOI, ISBN } = data;

  const metadata: SembleUrlMetadata = {
    title,
    description: abstractNote,
    author: extractAuthors(creators),
    publishedDate: extractPublishedDate(data),
    siteName: determineSiteName(data),
    type: mapCitoidUrlType(itemType),
    doi: DOI,
    isbn: ISBN,
  };
  return metadata;
}
164
/**
 * Picks the creators that should be credited and joins their names.
 *
 * Roles are tried in priority order, starting with "author". The first role
 * that has at least one creator wins, and only creators with that role are
 * used. If no listed role is present, every creator is used.
 *
 * @param creators - Creator list from a Citoid item; may be absent or empty.
 * @returns The joined names, or `undefined` when there is nothing to show.
 */
function extractAuthors(creators?: CitoidCreator[]): string | undefined {
  if (!creators || creators.length === 0) return undefined;

  const rolePriority = [
    "author",
    "artist",
    "performer",
    "director",
    "podcaster",
    "cartographer",
    "programmer",
    "presenter",
    "sponsor",
    "inventor",
    "interviewee",
    "bookAuthor",
    "editor",
    "contributor",
  ];

  // Highest-priority role that at least one creator actually has.
  const winningRole = rolePriority.find(role =>
    creators.some(person => person.creatorType === role)
  );

  const credited =
    winningRole === undefined
      ? creators
      : creators.filter(person => person.creatorType === winningRole);

  return joinAuthors(credited);
}
198
/**
 * Formats each creator and joins the non-empty names with ", ".
 *
 * @param creators - Creators to format, in display order.
 * @returns The joined names, or `undefined` when no creator yields a name.
 */
function joinAuthors(creators: CitoidCreator[]): string | undefined {
  const names: string[] = [];
  for (const person of creators) {
    const displayName = formatAuthor(person);
    // Creators without any usable name part format to "" and are skipped.
    if (displayName) names.push(displayName);
  }
  return names.length > 0 ? names.join(", ") : undefined;
}
204
/**
 * Produces the display name for a single creator.
 *
 * Preference order: the complete `name`, then "firstName lastName", then
 * whichever of the two parts is present. Each part is trimmed first, so a
 * whitespace-only field counts as missing and padded values do not leak
 * stray spaces into the result.
 *
 * @param creator - The creator to format.
 * @returns The display name, or "" when no usable name part exists.
 */
function formatAuthor(creator: CitoidCreator): string {
  // The data comes from an external service, so guard against non-string values too.
  const clean = (value?: string): string => (typeof value === "string" ? value.trim() : "");

  const fullName = clean(creator.name);
  if (fullName) return fullName;

  const first = clean(creator.firstName);
  const last = clean(creator.lastName);
  if (first && last) {
    return `${first} ${last}`;
  }
  return last || first || "";
}
212
/**
 * Chooses the "site name" for an item from the field that best fits its type.
 *
 * Each known item type has its own short preference list. Any other type,
 * including a missing one, goes through a generic list of publication,
 * site, repository and institution fields.
 *
 * @param data - The Citoid item to inspect.
 * @returns The first non-empty candidate for the item's type; when all
 *   candidates are empty, the last candidate's value (possibly `undefined`).
 */
function determineSiteName(data: CitoidResponse): string | undefined {
  const kind = data.itemType;
  const isAnyOf = (...candidates: CitoidUrlTypes[]): boolean =>
    candidates.some(candidate => candidate === kind);

  if (
    isAnyOf(
      CitoidUrlTypes.JOURNAL_ARTICLE,
      CitoidUrlTypes.MAGAZINE_ARTICLE,
      CitoidUrlTypes.NEWSPAPER_ARTICLE
    )
  ) {
    return data.publicationTitle || data.publisher;
  }
  if (kind === CitoidUrlTypes.BLOG_POST) {
    return data.blogTitle || data.websiteTitle;
  }
  if (kind === CitoidUrlTypes.FORUM_POST) {
    return data.forumTitle;
  }
  if (kind === CitoidUrlTypes.WEBPAGE) {
    return data.websiteTitle;
  }
  if (isAnyOf(CitoidUrlTypes.BOOK, CitoidUrlTypes.BOOK_SECTION)) {
    return data.publisher || data.series;
  }
  if (kind === CitoidUrlTypes.CONFERENCE_PAPER) {
    return data.proceedingsTitle || data.conferenceName || data.publisher;
  }
  if (kind === CitoidUrlTypes.THESIS) {
    return data.university || data.institution;
  }
  if (kind === CitoidUrlTypes.REPORT) {
    return data.institution || data.publisher;
  }
  if (isAnyOf(CitoidUrlTypes.DATASET, CitoidUrlTypes.PREPRINT)) {
    return data.repository || data.institution;
  }
  if (kind === CitoidUrlTypes.PODCAST) {
    return data.seriesTitle || data.network;
  }
  if (isAnyOf(CitoidUrlTypes.TV_BROADCAST, CitoidUrlTypes.RADIO_BROADCAST)) {
    return data.network || data.programTitle;
  }
  if (isAnyOf(CitoidUrlTypes.FILM, CitoidUrlTypes.VIDEO_RECORDING)) {
    return data.distributor || data.studio;
  }
  if (kind === CitoidUrlTypes.AUDIO_RECORDING) {
    return data.label || data.publisher;
  }

  // Unknown or unlisted type: take whatever source-like field is filled in.
  return (
    data.publicationTitle ||
    data.websiteTitle ||
    data.blogTitle ||
    data.forumTitle ||
    data.repository ||
    data.libraryCatalog ||
    data.institution ||
    data.university ||
    data.publisher
  );
}
262
/**
 * Finds the most relevant date string for an item and parses it.
 *
 * Cases, statutes and patents prefer their own date fields and fall back to
 * the generic `date`; every other item type uses `date` directly.
 *
 * @param data - The Citoid item to inspect.
 * @returns The parsed date, or `undefined` when no date string is present or
 *   it cannot be parsed.
 */
function extractPublishedDate(data: CitoidResponse): Date | undefined {
  const { itemType, date } = data;

  let rawDate: string | undefined = date;
  if (itemType === CitoidUrlTypes.CASE) {
    rawDate = data.dateDecided || date;
  } else if (itemType === CitoidUrlTypes.STATUTE) {
    rawDate = data.dateEnacted || date;
  } else if (itemType === CitoidUrlTypes.PATENT) {
    rawDate = data.issueDate || data.filingDate || date;
  }

  if (!rawDate) return undefined;
  return parseDate(rawDate);
}
281
/**
 * Turns a `YYYY-MM-DD` string into midnight UTC of that day, provided the
 * day really exists on the calendar.
 *
 * @param isoDay - A string already known to have the `YYYY-MM-DD` shape.
 * @returns The date, or `undefined` for impossible days such as Feb 30.
 */
function toUtcCalendarDate(isoDay: string): Date | undefined {
  const parsed = new Date(`${isoDay}T00:00:00.000Z`);
  if (Number.isNaN(parsed.getTime())) return undefined;
  // An engine may roll an impossible day over into the next month instead of
  // rejecting it; comparing the round-tripped day with the input catches that.
  return parsed.toISOString().slice(0, 10) === isoDay ? parsed : undefined;
}

/**
 * Parses a date string from a Citoid item.
 *
 * Recognised shapes, after trimming surrounding whitespace:
 * - `YYYY`, `YYYY-MM` and `YYYY-MM-DD`: read as midnight UTC, with a missing
 *   month or day defaulting to 01.
 * - `YYYY-MM-DDThh:mm:ss...`: handed to `Date` as is, after checking that the
 *   calendar day exists.
 * - Anything else: handed to `Date` as a best effort.
 *
 * @param dateString - The raw date text.
 * @returns The parsed date, or `undefined` when the text is empty, names an
 *   impossible calendar day, or cannot be parsed.
 */
function parseDate(dateString: string): Date | undefined {
  const text = dateString.trim();
  if (text === "") return undefined;

  // Year, year-month, or full calendar date.
  const calendar = /^(\d{4})(?:-(\d{2})(?:-(\d{2}))?)?$/.exec(text);
  if (calendar) {
    const [, year, month = "01", day = "01"] = calendar;
    return toUtcCalendarDate(`${year}-${month}-${day}`);
  }

  // Timestamp: validate only the calendar part, then let Date handle the time
  // and any timezone designator.
  const timestamp = /^(\d{4}-\d{2}-\d{2})T\d{2}:\d{2}:\d{2}/.exec(text);
  if (timestamp) {
    const [, dayPart = ""] = timestamp;
    if (toUtcCalendarDate(dayPart) === undefined) return undefined;
  }

  const parsed = new Date(text);
  return Number.isNaN(parsed.getTime()) ? undefined : parsed;
}