···11+//
22+// GTMNSString+HTML.h
33+// Dealing with NSStrings that contain HTML
44+//
55+// Copyright 2006-2008 Google Inc.
66+//
77+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
88+// use this file except in compliance with the License. You may obtain a copy
99+// of the License at
1010+//
1111+// http://www.apache.org/licenses/LICENSE-2.0
1212+//
1313+// Unless required by applicable law or agreed to in writing, software
1414+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
1515+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
1616+// License for the specific language governing permissions and limitations under
1717+// the License.
1818+//
1919+2020+#import <Foundation/Foundation.h>
2121+2222+/// Utilities for NSStrings containing HTML
@interface NSString (GTMNSStringHTMLAdditions)

/// Get a string where internal characters that need escaping for HTML are escaped
//
///  For example, '&' becomes '&amp;'. This will only cover characters from table
///  A.2.2 of http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
///  which is what you want for a unicode encoded webpage. If you have an ascii
///  or non-encoded webpage, please use gtm_stringByEscapingForAsciiHTML which
///  will encode all characters.
///
///  This call is only safe once: escaping an already escaped string would
///  escape the '&' of the entities produced by the first pass again.
//
//  Returns:
//    Autoreleased NSString
//
- (NSString *)gtm_stringByEscapingForHTML;

/// Get a string where internal characters that need escaping for HTML are escaped
//
///  For example, '&' becomes '&amp;'.
///  All non-mapped characters (unicode that don't have a &keyword; mapping)
///  will be converted to the appropriate &#xxx; value. If your webpage is
///  unicode encoded (UTF16 or UTF8) use gtm_stringByEscapingForHTML instead as
///  it is faster, and produces less bloated and more readable HTML (as long as
///  you are using a unicode compliant HTML reader).
///
///  This call is only safe once: escaping an already escaped string would
///  escape the '&' of the entities produced by the first pass again.
//
//  Returns:
//    Autoreleased NSString
//
- (NSString *)gtm_stringByEscapingForAsciiHTML;

/// Get a string where internal characters that are escaped for HTML are unescaped
//
///  For example, '&amp;' becomes '&'.
///  Handles numeric references in both decimal (&#32;) and hexadecimal
///  (&#x32;) form as well.
//
//  Returns:
//    Autoreleased NSString
//
- (NSString *)gtm_stringByUnescapingFromHTML;

@end
···11+//
22+// NSString+HTML.h
33+// MWFeedParser
44+//
55+// Copyright (c) 2010 Michael Waterfall
66+//
77+// Permission is hereby granted, free of charge, to any person obtaining a copy
88+// of this software and associated documentation files (the "Software"), to deal
99+// in the Software without restriction, including without limitation the rights
1010+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1111+// copies of the Software, and to permit persons to whom the Software is
1212+// furnished to do so, subject to the following conditions:
1313+//
1414+// 1. The above copyright notice and this permission notice shall be included
1515+// in all copies or substantial portions of the Software.
1616+//
1717+// 2. This Software cannot be used to archive or collect data such as (but not
1818+// limited to) that of events, news, experiences and activities, for the
1919+// purpose of any concept relating to diary/journal keeping.
2020+//
2121+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
2222+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
2323+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
2424+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2525+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2626+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
2727+// THE SOFTWARE.
2828+//
2929+3030+#import <Foundation/Foundation.h>
3131+3232+// Dependant upon GTMNSString+HTML
@interface NSString (HTML)

// Strips HTML tags & comments, collapses runs of whitespace and newlines
// and decodes HTML character entities.
- (NSString *)stringByConvertingHTMLToPlainText;

// Decode all HTML entities using GTM (gtm_stringByUnescapingFromHTML).
// The result is wrapped in a new NSString, so it is safe to keep even when
// the receiver is an NSMutableString.
- (NSString *)stringByDecodingHTMLEntities;

// Encode all HTML entities using GTM's ASCII escaping
// (gtm_stringByEscapingForAsciiHTML).
- (NSString *)stringByEncodingHTMLEntities;

// Encode HTML entities using GTM.
// - isUnicode == YES: minimal encoding (gtm_stringByEscapingForHTML). It will
//   only cover characters from table A.2.2 of
//   http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
//   which is what you want for a unicode encoded webpage.
// - isUnicode == NO: ASCII encoding (gtm_stringByEscapingForAsciiHTML).
- (NSString *)stringByEncodingHTMLEntities:(BOOL)isUnicode;

// Replace newlines with <br /> tags.
- (NSString *)stringWithNewLinesAsBRs;

// Collapse every run of newlines and white space into a single space and
// drop such runs entirely at the beginning and end of the string.
- (NSString *)stringByRemovingNewLinesAndWhitespace;

// Wrap plain URLs in <a href="..." class="linkified">...</a>
//  - Ignores URLs inside tags (any URL beginning with =")
//  - HTTP & HTTPS schemes only
//  - Only works in iOS 4+ as we use NSRegularExpression (returns self if not supported so be careful with NSMutableStrings)
//  - Expression: (?<!=")\b((http|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?)
//  - Adapted from http://regexlib.com/REDetails.aspx?regexp_id=96
- (NSString *)stringByLinkifyingURLs;

// DEPRECATED - Please use -stringByConvertingHTMLToPlainText instead.
- (NSString *)stringByStrippingTags __attribute__((deprecated));

@end
+342
NSString+HTML.m
···11+//
22+// NSString+HTML.m
33+// MWFeedParser
44+//
55+// Copyright (c) 2010 Michael Waterfall
66+//
77+// Permission is hereby granted, free of charge, to any person obtaining a copy
88+// of this software and associated documentation files (the "Software"), to deal
99+// in the Software without restriction, including without limitation the rights
1010+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1111+// copies of the Software, and to permit persons to whom the Software is
1212+// furnished to do so, subject to the following conditions:
1313+//
1414+// 1. The above copyright notice and this permission notice shall be included
1515+// in all copies or substantial portions of the Software.
1616+//
1717+// 2. This Software cannot be used to archive or collect data such as (but not
1818+// limited to) that of events, news, experiences and activities, for the
1919+// purpose of any concept relating to diary/journal keeping.
2020+//
2121+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
2222+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
2323+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
2424+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2525+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2626+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
2727+// THE SOFTWARE.
2828+//
2929+3030+#import "NSString+HTML.h"
3131+#import "GTMNSString+HTML.h"
3232+3333+@implementation NSString (HTML)
3434+3535+#pragma mark - Instance Methods
// Converts an HTML fragment to plain text:
//  - comments (<!-- ... -->) and tags are removed
//  - a closing tag acts as a word separator unless it closes one of the
//    inline elements listed below; opening tags are removed without replacement
//  - every run of whitespace/newlines acts as a word separator
//  - separators are emitted as exactly one space, and never at the beginning
//    or end of the result
//  - HTML entities are decoded last, via -stringByDecodingHTMLEntities
// Returns an autoreleased string.
- (NSString *)stringByConvertingHTMLToPlainText {

    // Pool - temporaries created while scanning are released before returning
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

    // Character sets
    // Whitespace plus the "strange" new lines: Next Line U+0085, Form Feed U+000C,
    // Line Separator U+2028 and Paragraph Separator U+2029
    NSString *whitespaceString = [NSString stringWithFormat:@" \t\n\r%C%C%C%C",
                                  (unichar)0x0085, (unichar)0x000C, (unichar)0x2028, (unichar)0x2029];
    NSCharacterSet *newLineAndWhitespaceCharacters = [NSCharacterSet characterSetWithCharactersInString:whitespaceString];
    NSCharacterSet *stopCharacters = [NSCharacterSet characterSetWithCharactersInString:
                                      [@"<" stringByAppendingString:whitespaceString]];
    NSCharacterSet *tagNameCharacters = [NSCharacterSet characterSetWithCharactersInString:
                                         @"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"];

    // Closing tags of these inline elements do not separate words
    NSSet *inlineTagNames = [NSSet setWithObjects:@"a", @"b", @"i", @"q", @"span", @"em", @"strong",
                             @"cite", @"abbr", @"acronym", @"label", nil];

    // Scan and find all tags
    NSMutableString *result = [[NSMutableString alloc] initWithCapacity:self.length];
    NSScanner *scanner = [[NSScanner alloc] initWithString:self];
    [scanner setCharactersToBeSkipped:nil];
    [scanner setCaseSensitive:YES];
    NSString *str = nil, *tagName = nil;

    // Set whenever a separator (whitespace or block-level closing tag) has been
    // seen since the last text. The space is only written once more text
    // follows, which rules out leading, trailing and doubled spaces.
    BOOL pendingSpace = NO;

    do {

        // Scan up to the start of a tag or whitespace
        if ([scanner scanUpToCharactersFromSet:stopCharacters intoString:&str]) {
            if (pendingSpace && result.length > 0) [result appendString:@" "];
            pendingSpace = NO;
            [result appendString:str];
            str = nil; // reset
        }

        // Check if we've stopped at a tag/comment or whitespace
        if ([scanner scanString:@"<" intoString:NULL]) {

            // Stopped at a comment or tag
            if ([scanner scanString:@"!--" intoString:NULL]) {

                // Comment - skip it entirely (to the end of the string if unterminated)
                [scanner scanUpToString:@"-->" intoString:NULL];
                [scanner scanString:@"-->" intoString:NULL];

            } else {

                // Tag - a closing tag separates words unless it is inline
                if ([scanner scanString:@"/" intoString:NULL]) {
                    tagName = nil;
                    BOOL isInlineTag = NO;
                    if ([scanner scanCharactersFromSet:tagNameCharacters intoString:&tagName]) {
                        isInlineTag = [inlineTagNames containsObject:[tagName lowercaseString]];
                    }
                    if (!isInlineTag) pendingSpace = YES;
                }

                // Scan past tag
                [scanner scanUpToString:@">" intoString:NULL];
                [scanner scanString:@">" intoString:NULL];

            }

        } else {

            // Stopped at whitespace - the whole run counts as one separator
            if ([scanner scanCharactersFromSet:newLineAndWhitespaceCharacters intoString:NULL]) {
                pendingSpace = YES;
            }

        }

    } while (![scanner isAtEnd]);

    // Cleanup
    [scanner release];

    // Decode HTML entities; retain so the string survives the pool drain
    NSString *retString = [[result stringByDecodingHTMLEntities] retain];
    [result release];

    // Drain
    [pool drain];

    // Return
    return [retString autorelease];

}
// Unescapes HTML entities using GTM's gtm_stringByUnescapingFromHTML.
- (NSString *)stringByDecodingHTMLEntities {
    NSString *unescaped = [self gtm_stringByUnescapingFromHTML];
    // GTM can return the receiver itself, which may be a mutable string,
    // so the result is always wrapped in a new NSString
    return [NSString stringWithString:unescaped];
}
// Escapes HTML entities using GTM's ASCII escaping (gtm_stringByEscapingForAsciiHTML).
- (NSString *)stringByEncodingHTMLEntities {
    NSString *escaped = [self gtm_stringByEscapingForAsciiHTML];
    // GTM can return the receiver itself, which may be a mutable string,
    // so the result is always wrapped in a new NSString
    return [NSString stringWithString:escaped];
}
// Escapes HTML entities using GTM. With isUnicode == YES the minimal escaping
// (gtm_stringByEscapingForHTML) is used, otherwise the ASCII escaping
// (gtm_stringByEscapingForAsciiHTML).
- (NSString *)stringByEncodingHTMLEntities:(BOOL)isUnicode {
    NSString *escaped = nil;
    if (isUnicode) {
        escaped = [self gtm_stringByEscapingForHTML];
    } else {
        escaped = [self gtm_stringByEscapingForAsciiHTML];
    }
    // GTM can return the receiver itself, which may be a mutable string,
    // so the result is always wrapped in a new NSString
    return [NSString stringWithString:escaped];
}
// Returns a copy of the receiver in which every line break is replaced by
// "<br />". A "\r\n" pair counts as ONE line break wherever it occurs; every
// other new line character produces its own "<br />".
// Returns an autoreleased string.
- (NSString *)stringWithNewLinesAsBRs {

    // Pool
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

    // Besides \n and \r, these strange new lines are recognised:
    //   Next Line, U+0085
    //   Form Feed, U+000C
    //   Line Separator, U+2028
    //   Paragraph Separator, U+2029
    NSCharacterSet *newLineCharacters = [NSCharacterSet characterSetWithCharactersInString:
                                         [NSString stringWithFormat:@"\n\r%C%C%C%C",
                                          (unichar)0x0085, (unichar)0x000C, (unichar)0x2028, (unichar)0x2029]];

    // Scanner
    NSScanner *scanner = [[NSScanner alloc] initWithString:self];
    [scanner setCharactersToBeSkipped:nil];
    NSMutableString *result = [[NSMutableString alloc] initWithCapacity:self.length];
    NSString *temp = nil;

    // Scan
    while (![scanner isAtEnd]) {

        // Copy non new line characters through unchanged
        temp = nil;
        if ([scanner scanUpToCharactersFromSet:newLineCharacters intoString:&temp]) {
            [result appendString:temp];
        }

        // Take the whole run of new line characters and emit one <br /> per
        // line break. Walking the run character by character lets a \r\n pair
        // be combined even when it is not at the start of the run (e.g. "\n\r\n").
        temp = nil;
        if ([scanner scanCharactersFromSet:newLineCharacters intoString:&temp]) {
            NSUInteger runLength = temp.length;
            for (NSUInteger i = 0; i < runLength; i++) {
                BOOL isCRLF = ([temp characterAtIndex:i] == '\r' &&
                               i + 1 < runLength &&
                               [temp characterAtIndex:i + 1] == '\n');
                if (isCRLF) i++; // consume the \n together with its \r
                [result appendString:@"<br />"];
            }
        }

    }

    // Cleanup; retain the immutable copy so it survives the pool drain
    [scanner release];
    NSString *retString = [[NSString stringWithString:result] retain];
    [result release];

    // Drain
    [pool drain];

    // Return
    return [retString autorelease];

}
206206+207207+- (NSString *)stringByRemovingNewLinesAndWhitespace {
208208+209209+ // Pool
210210+ NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
211211+212212+ // Strange New lines:
213213+ // Next Line, U+0085
214214+ // Form Feed, U+000C
215215+ // Line Separator, U+2028
216216+ // Paragraph Separator, U+2029
217217+218218+ // Scanner
219219+ NSScanner *scanner = [[NSScanner alloc] initWithString:self];
220220+ [scanner setCharactersToBeSkipped:nil];
221221+ NSMutableString *result = [[NSMutableString alloc] init];
222222+ NSString *temp;
223223+ NSCharacterSet *newLineAndWhitespaceCharacters = [NSCharacterSet characterSetWithCharactersInString:
224224+ [NSString stringWithFormat:@" \t\n\r%C%C%C%C", 0x0085, 0x000C, 0x2028, 0x2029]];
225225+ // Scan
226226+ while (![scanner isAtEnd]) {
227227+228228+ // Get non new line or whitespace characters
229229+ temp = nil;
230230+ [scanner scanUpToCharactersFromSet:newLineAndWhitespaceCharacters intoString:&temp];
231231+ if (temp) [result appendString:temp];
232232+233233+ // Replace with a space
234234+ if ([scanner scanCharactersFromSet:newLineAndWhitespaceCharacters intoString:NULL]) {
235235+ if (result.length > 0 && ![scanner isAtEnd]) // Dont append space to beginning or end of result
236236+ [result appendString:@" "];
237237+ }
238238+239239+ }
240240+241241+ // Cleanup
242242+ [scanner release];
243243+244244+ // Return
245245+ NSString *retString = [[NSString stringWithString:result] retain];
246246+ [result release];
247247+248248+ // Drain
249249+ [pool drain];
250250+251251+ // Return
252252+ return [retString autorelease];
253253+254254+}
// Wraps every plain http/https URL in <a href="URL" class="linkified">URL</a>.
// URLs directly preceded by =" (i.e. already inside a tag attribute) are left
// alone. Returns the receiver itself when NSRegularExpression is not available
// or the expression cannot be created, otherwise an autoreleased string.
- (NSString *)stringByLinkifyingURLs {
    // NSRegularExpression is looked up at runtime so the code still loads where the class is missing
    if (!NSClassFromString(@"NSRegularExpression")) return self;
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
    // The pattern is used verbatim (it is never passed through a format
    // method), so '%' is written once inside the character classes
    NSString *pattern = @"(?<!=\")\\b((http|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-\\.,@?^=%&:/~\\+#]*[\\w\\-\\@?^=%&/~\\+#])?)";
    NSRegularExpression *regex = [NSRegularExpression regularExpressionWithPattern:pattern options:0 error:NULL];
    NSString *modifiedString = nil;
    if (regex) {
        // $1 is the complete URL; retain so the result survives the pool drain
        modifiedString = [[regex stringByReplacingMatchesInString:self
                                                          options:0
                                                            range:NSMakeRange(0, [self length])
                                                     withTemplate:@"<a href=\"$1\" class=\"linkified\">$1</a>"] retain];
    }
    [pool drain];
    // Fall back to the unmodified receiver instead of returning nil
    if (!modifiedString) return self;
    return [modifiedString autorelease];
}
// DEPRECATED - use -stringByConvertingHTMLToPlainText instead.
// Removes everything that looks like a tag ("<" up to the next ">") from the
// receiver. Tags are replaced by a space, except for the bare inline tags
// <a>, <span>, <strong>, <em> and their closing forms, which are removed
// without replacement. Whitespace is collapsed afterwards with
// -stringByRemovingNewLinesAndWhitespace.
// Returns an autoreleased string.
- (NSString *)stringByStrippingTags {

    // Find first < and short-cut if we can. This is done before the pool is
    // created so that the early return cannot leave an undrained pool behind.
    NSUInteger firstTagIndex = [self rangeOfString:@"<" options:NSLiteralSearch].location;
    if (firstTagIndex == NSNotFound) {
        return [NSString stringWithString:self]; // return copy of string as no tags found
    }

    // Pool
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

    // Scan and find all tags
    NSScanner *scanner = [NSScanner scannerWithString:self];
    [scanner setCharactersToBeSkipped:nil];
    NSMutableSet *tags = [[NSMutableSet alloc] init];
    NSString *tag;
    do {

        // Skip to the next <, then take everything up to (not including) the next >
        tag = nil;
        [scanner scanUpToString:@"<" intoString:NULL];
        [scanner scanUpToString:@">" intoString:&tag];

        // Add to set, with the closing > restored
        if (tag) {
            NSString *t = [[NSString alloc] initWithFormat:@"%@>", tag];
            [tags addObject:t];
            [t release];
        }

    } while (![scanner isAtEnd]);

    // Strings
    NSMutableString *result = [[NSMutableString alloc] initWithString:self];
    NSString *finalString;

    // Tags that are removed without leaving a space behind. Only the exact,
    // attribute-less spellings match.
    NSSet *inlineTags = [NSSet setWithObjects:@"<a>", @"</a>", @"<span>", @"</span>",
                         @"<strong>", @"</strong>", @"<em>", @"</em>", nil];

    // Replace tags
    for (NSString *t in tags) {

        // Replace tag with space unless it's an inline element
        NSString *replacement = [inlineTags containsObject:t] ? @"" : @" ";

        // Replace
        [result replaceOccurrencesOfString:t
                                withString:replacement
                                   options:NSLiteralSearch
                                     range:NSMakeRange(0, result.length)];
    }

    // Remove multi-spaces and line breaks; retain so the result survives the pool drain
    finalString = [[result stringByRemovingNewLinesAndWhitespace] retain];

    // Cleanup
    [result release];
    [tags release];

    // Drain
    [pool drain];

    // Return
    return [finalString autorelease];

}
341341+342342+@end