Markdown -> Semble importer
at canon 301 lines 9.5 kB view raw
// Isolated from Semble repo to keep Citoid mapping auditable and easy to update.

/** A single creator entry of a Citoid response. */
export interface CitoidCreator {
  firstName?: string;
  lastName?: string;
  /** Role of the creator, e.g. "author" or "editor". */
  creatorType?: string;
  /** Single-field name; when present it wins over firstName/lastName. */
  name?: string;
}

/** The subset of Citoid response fields that this module reads. */
export interface CitoidResponse {
  itemType?: string;
  creators?: CitoidCreator[];
  title?: string;
  date?: string;
  abstractNote?: string;
  publicationTitle?: string;
  websiteTitle?: string;
  blogTitle?: string;
  forumTitle?: string;
  repository?: string;
  libraryCatalog?: string;
  institution?: string;
  university?: string;
  publisher?: string;
  proceedingsTitle?: string;
  conferenceName?: string;
  series?: string;
  seriesTitle?: string;
  network?: string;
  programTitle?: string;
  distributor?: string;
  studio?: string;
  label?: string;
  DOI?: string;
  ISBN?: string;
  dateDecided?: string;
  dateEnacted?: string;
  issueDate?: string;
  filingDate?: string;
}

/** URL categories on the Semble side. */
export enum SembleUrlType {
  LINK = "link",
  ARTICLE = "article",
  BOOK = "book",
  RESEARCH = "research",
  AUDIO = "audio",
  VIDEO = "video",
  SOCIAL = "social",
  SOFTWARE = "software"
}

/** Metadata record produced by {@link buildSembleMetadataFromCitoid}. */
export interface SembleUrlMetadata {
  title?: string;
  description?: string;
  author?: string;
  publishedDate?: Date;
  siteName?: string;
  type?: SembleUrlType;
  doi?: string;
  isbn?: string;
}

/** Citoid `itemType` values that this module knows how to map. */
export enum CitoidUrlTypes {
  ARTWORK = "artwork",
  AUDIO_RECORDING = "audioRecording",
  BILL = "bill",
  BLOG_POST = "blogPost",
  BOOK = "book",
  BOOK_SECTION = "bookSection",
  CASE = "case",
  CONFERENCE_PAPER = "conferencePaper",
  DATASET = "dataset",
  DICTIONARY_ENTRY = "dictionaryEntry",
  DOCUMENT = "document",
  EMAIL = "email",
  ENCYCLOPEDIA_ARTICLE = "encyclopediaArticle",
  FILM = "film",
  FORUM_POST = "forumPost",
  HEARING = "hearing",
  INSTANT_MESSAGE = "instantMessage",
  INTERVIEW = "interview",
  JOURNAL_ARTICLE = "journalArticle",
  LETTER = "letter",
  MAGAZINE_ARTICLE = "magazineArticle",
  MANUSCRIPT = "manuscript",
  MAP = "map",
  NEWSPAPER_ARTICLE = "newspaperArticle",
  NOTE = "note",
  PATENT = "patent",
  PODCAST = "podcast",
  PREPRINT = "preprint",
  PRESENTATION = "presentation",
  RADIO_BROADCAST = "radioBroadcast",
  REPORT = "report",
  COMPUTER_PROGRAM = "computerProgram",
  STANDARD = "standard",
  STATUTE = "statute",
  THESIS = "thesis",
  TV_BROADCAST = "tvBroadcast",
  VIDEO_RECORDING = "videoRecording",
  WEBPAGE = "webpage"
}

// Exhaustive Citoid -> Semble type table; the Record type forces an entry for
// every CitoidUrlTypes member.
const citoidToSembleType: Record<CitoidUrlTypes, SembleUrlType> = {
  [CitoidUrlTypes.ARTWORK]: SembleUrlType.LINK,
  [CitoidUrlTypes.AUDIO_RECORDING]: SembleUrlType.AUDIO,
  [CitoidUrlTypes.BILL]: SembleUrlType.LINK,
  [CitoidUrlTypes.BLOG_POST]: SembleUrlType.ARTICLE,
  [CitoidUrlTypes.BOOK]: SembleUrlType.BOOK,
  [CitoidUrlTypes.BOOK_SECTION]: SembleUrlType.BOOK,
  [CitoidUrlTypes.CASE]: SembleUrlType.LINK,
  [CitoidUrlTypes.CONFERENCE_PAPER]: SembleUrlType.RESEARCH,
  [CitoidUrlTypes.DATASET]: SembleUrlType.RESEARCH,
  [CitoidUrlTypes.DICTIONARY_ENTRY]: SembleUrlType.LINK,
  [CitoidUrlTypes.DOCUMENT]: SembleUrlType.LINK,
  [CitoidUrlTypes.EMAIL]: SembleUrlType.LINK,
  [CitoidUrlTypes.ENCYCLOPEDIA_ARTICLE]: SembleUrlType.ARTICLE,
  [CitoidUrlTypes.FILM]: SembleUrlType.VIDEO,
  [CitoidUrlTypes.FORUM_POST]: SembleUrlType.SOCIAL,
  [CitoidUrlTypes.HEARING]: SembleUrlType.LINK,
  [CitoidUrlTypes.INSTANT_MESSAGE]: SembleUrlType.LINK,
  [CitoidUrlTypes.INTERVIEW]: SembleUrlType.LINK,
  [CitoidUrlTypes.JOURNAL_ARTICLE]: SembleUrlType.RESEARCH,
  [CitoidUrlTypes.LETTER]: SembleUrlType.LINK,
  [CitoidUrlTypes.MAGAZINE_ARTICLE]: SembleUrlType.ARTICLE,
  [CitoidUrlTypes.MANUSCRIPT]: SembleUrlType.RESEARCH,
  [CitoidUrlTypes.MAP]: SembleUrlType.LINK,
  [CitoidUrlTypes.NEWSPAPER_ARTICLE]: SembleUrlType.ARTICLE,
  [CitoidUrlTypes.NOTE]: SembleUrlType.LINK,
  [CitoidUrlTypes.PATENT]: SembleUrlType.LINK,
  [CitoidUrlTypes.PODCAST]: SembleUrlType.AUDIO,
  [CitoidUrlTypes.PREPRINT]: SembleUrlType.RESEARCH,
  [CitoidUrlTypes.PRESENTATION]: SembleUrlType.RESEARCH,
  [CitoidUrlTypes.RADIO_BROADCAST]: SembleUrlType.AUDIO,
  [CitoidUrlTypes.REPORT]: SembleUrlType.RESEARCH,
  [CitoidUrlTypes.COMPUTER_PROGRAM]: SembleUrlType.SOFTWARE,
  [CitoidUrlTypes.STANDARD]: SembleUrlType.LINK,
  [CitoidUrlTypes.STATUTE]: SembleUrlType.LINK,
  [CitoidUrlTypes.THESIS]: SembleUrlType.RESEARCH,
  [CitoidUrlTypes.TV_BROADCAST]: SembleUrlType.VIDEO,
  [CitoidUrlTypes.VIDEO_RECORDING]: SembleUrlType.VIDEO,
  [CitoidUrlTypes.WEBPAGE]: SembleUrlType.LINK
};

// Creator roles tried, in this order, when no creator is tagged "author".
const FALLBACK_CREATOR_TYPES: readonly string[] = [
  "artist",
  "performer",
  "director",
  "podcaster",
  "cartographer",
  "programmer",
  "presenter",
  "sponsor",
  "inventor",
  "interviewee",
  "bookAuthor",
  "editor",
  "contributor"
];

/**
 * Maps a Citoid `itemType` string to the corresponding Semble URL type.
 *
 * @param citoidType - Raw `itemType` value; may be missing or unrecognized.
 * @returns The mapped type, or `SembleUrlType.LINK` when the input is empty
 *   or is not a key of the mapping table.
 */
export function mapCitoidUrlType(citoidType?: string): SembleUrlType {
  if (!citoidType) return SembleUrlType.LINK;

  // Only own keys count: a bare index lookup would also resolve inherited
  // members such as "constructor" or "toString" and return a non-enum value.
  if (!Object.prototype.hasOwnProperty.call(citoidToSembleType, citoidType)) {
    return SembleUrlType.LINK;
  }
  return citoidToSembleType[citoidType as CitoidUrlTypes];
}

/**
 * Converts a Citoid response into the Semble metadata shape.
 *
 * @param data - Citoid response; every field is optional.
 * @returns Metadata with title, description, DOI and ISBN copied through and
 *   author, published date, site name and type derived from the response.
 */
export function buildSembleMetadataFromCitoid(data: CitoidResponse): SembleUrlMetadata {
  return {
    title: data.title,
    description: data.abstractNote,
    author: extractAuthors(data.creators),
    publishedDate: extractPublishedDate(data),
    siteName: determineSiteName(data),
    type: mapCitoidUrlType(data.itemType),
    doi: data.DOI,
    isbn: data.ISBN
  };
}

/**
 * Picks the creators to credit and renders them as one comma-separated string.
 *
 * Creators tagged "author" win; otherwise the first role in
 * FALLBACK_CREATOR_TYPES that has any creator is used; otherwise all creators.
 */
function extractAuthors(creators?: CitoidCreator[]): string | undefined {
  if (!creators || creators.length === 0) return undefined;

  const authorCreators = creators.filter(creator => creator.creatorType === "author");
  if (authorCreators.length > 0) {
    return joinAuthors(authorCreators);
  }

  for (const type of FALLBACK_CREATOR_TYPES) {
    const creatorsByType = creators.filter(entry => entry.creatorType === type);
    if (creatorsByType.length > 0) {
      return joinAuthors(creatorsByType);
    }
  }

  return joinAuthors(creators);
}

/** Formats each creator, drops empty results and joins the rest with ", ". */
function joinAuthors(creators: CitoidCreator[]): string | undefined {
  const formatted = creators.map(formatAuthor).filter(Boolean);
  if (formatted.length === 0) return undefined;
  return formatted.join(", ");
}

/**
 * Renders one creator: `name` if set, else "first last", else whichever of
 * lastName/firstName exists, else the empty string.
 */
function formatAuthor(creator: CitoidCreator): string {
  if (creator.name) return creator.name;
  if (creator.firstName && creator.lastName) {
    return `${creator.firstName} ${creator.lastName}`;
  }
  return creator.lastName || creator.firstName || "";
}

/**
 * Chooses the field that best names the hosting site/publication for the
 * response's item type, falling back to a generic field chain for types
 * without a dedicated rule.
 */
function determineSiteName(data: CitoidResponse): string | undefined {
  const itemType = data.itemType;
  switch (itemType) {
    case CitoidUrlTypes.JOURNAL_ARTICLE:
    case CitoidUrlTypes.MAGAZINE_ARTICLE:
    case CitoidUrlTypes.NEWSPAPER_ARTICLE:
      return data.publicationTitle || data.publisher;
    case CitoidUrlTypes.BLOG_POST:
      return data.blogTitle || data.websiteTitle;
    case CitoidUrlTypes.FORUM_POST:
      return data.forumTitle;
    case CitoidUrlTypes.WEBPAGE:
      return data.websiteTitle;
    case CitoidUrlTypes.BOOK:
    case CitoidUrlTypes.BOOK_SECTION:
      return data.publisher || data.series;
    case CitoidUrlTypes.CONFERENCE_PAPER:
      return data.proceedingsTitle || data.conferenceName || data.publisher;
    case CitoidUrlTypes.THESIS:
      return data.university || data.institution;
    case CitoidUrlTypes.REPORT:
      return data.institution || data.publisher;
    case CitoidUrlTypes.DATASET:
    case CitoidUrlTypes.PREPRINT:
      return data.repository || data.institution;
    case CitoidUrlTypes.PODCAST:
      return data.seriesTitle || data.network;
    case CitoidUrlTypes.TV_BROADCAST:
    case CitoidUrlTypes.RADIO_BROADCAST:
      return data.network || data.programTitle;
    case CitoidUrlTypes.FILM:
    case CitoidUrlTypes.VIDEO_RECORDING:
      return data.distributor || data.studio;
    case CitoidUrlTypes.AUDIO_RECORDING:
      return data.label || data.publisher;
    default:
      return (
        data.publicationTitle ||
        data.websiteTitle ||
        data.blogTitle ||
        data.forumTitle ||
        data.repository ||
        data.libraryCatalog ||
        data.institution ||
        data.university ||
        data.publisher
      );
  }
}

/**
 * Selects the date field appropriate for the item type (cases, statutes and
 * patents have dedicated fields; everything else uses `date`) and parses it.
 */
function extractPublishedDate(data: CitoidResponse): Date | undefined {
  let dateString: string | undefined;
  switch (data.itemType) {
    case CitoidUrlTypes.CASE:
      dateString = data.dateDecided || data.date;
      break;
    case CitoidUrlTypes.STATUTE:
      dateString = data.dateEnacted || data.date;
      break;
    case CitoidUrlTypes.PATENT:
      dateString = data.issueDate || data.filingDate || data.date;
      break;
    default:
      dateString = data.date;
      break;
  }
  return dateString ? parseDate(dateString) : undefined;
}

/**
 * Parses a date string into a Date, returning undefined when it is invalid.
 *
 * `YYYY`, `YYYY-MM` and `YYYY-MM-DD` are padded to midnight UTC of the first
 * matching day; every other string is handed to the Date constructor as-is.
 */
function parseDate(dateString: string): Date | undefined {
  let candidate = dateString;
  if (/^\d{4}$/.test(dateString)) {
    candidate = `${dateString}-01-01T00:00:00.000Z`;
  } else if (/^\d{4}-\d{2}$/.test(dateString)) {
    candidate = `${dateString}-01T00:00:00.000Z`;
  } else if (/^\d{4}-\d{2}-\d{2}$/.test(dateString)) {
    candidate = `${dateString}T00:00:00.000Z`;
  }
  // NOTE(review): date-time strings without a UTC offset are interpreted as
  // local time by Date, unlike the UTC-pinned forms above — confirm intended.
  const parsed = new Date(candidate);
  return Number.isNaN(parsed.getTime()) ? undefined : parsed;
}