/*
    Written by Håkan Waara (hakan@konstochvanligasaker.se), June 2007.

    You may only use LinkScraper under the terms of the GPLv2 license. See

    Get the latest version (using Subversion or your favorite web browser) at
    https://svn.konstochvanligasaker.se/general
*/

#import "LinkScraper.h"

extern "C" {
// html-parse is plain C; keep the compiler from treating its declarations as C++.
#include "html-parse.h"
}

@interface LinkScraper (Private)
- (void)parseHTML:(NSDictionary *)args;
+ (NSString *)plaintextifyHTML:(NSString *)chunk;
@end

/*
    State shared between successive parse_links() callbacks while one document
    is being parsed. The struct retains every object it is constructed with and
    releases everything it holds (including the internal state objects) when it
    is destroyed.
*/
struct parsing_context
{
    // Output dictionary: link URL => link name.
    NSMutableDictionary *links;

    // Suffix filter (e.g. ".txt" to keep only links ending in .txt); may be nil.
    NSString *filter;

    // URL of the document being parsed.
    NSURL *documentURL;

    /* internal state variables */

    // Position just after the most recent <a ...> start tag, used to extract
    // the contents of the last <a>...</a> pair. NULL when no link is open.
    char *_lastStartTagEndPos;

    // Whether the last <a>...</a> pair had other tags nested inside it.
    bool _lastStartTagContainsOtherTags;

    // The URL most recently recorded for a link (owned by the context).
    NSString *_lastAddedURL;

    // Base URL, if one was specified by a <base> tag (owned by the context).
    NSURL *_baseURL;

    parsing_context(NSMutableDictionary *_links, NSString *_filter, NSURL *_documentURL)
        : links([_links retain]),
          filter([_filter retain]),
          documentURL([_documentURL retain]),
          _lastStartTagEndPos(NULL),
          _lastStartTagContainsOtherTags(false),
          _lastAddedURL(nil),
          _baseURL(nil)
    {
    }

    ~parsing_context()
    {
        [links release];
        [filter release];
        [documentURL release];
        [_lastAddedURL release];
        [_baseURL release];
    }
};

typedef struct parsing_context parsing_context;

// XXX: could this be static even though it is used from threads?
void parse_links(struct taginfo *tag, parsing_context *ctx);

@implementation LinkScraper

/* Creates a scraper with empty download bookkeeping. */
- (id)init
{
    if ((self = [super init])) {
        downloadObjects = [NSMutableArray new];
        downloadPointersToSavePaths = [NSMutableDictionary new];
    }
    return self;
}

/*
    Convenience initializer: sets the delegate and, when theURL is non-nil,
    immediately starts scraping it.
*/
- (id)initWithURL:(NSString *)theURL delegate:(id)theDelegate
{
    if ((self = [self init])) {
        [self setDelegate:theDelegate];
        if (theURL) {
            [self scrapeURL:theURL];
        }
    }
    return self;
}

- (void)dealloc
{
    delegate = nil;

    // stop all ongoing downloads
    NSEnumerator *e = [downloadObjects objectEnumerator];
    NSURLDownload *currentDownload = nil;
    while ((currentDownload = [e nextObject])) {
        [currentDownload cancel];
    }

    [downloadObjects release];
    [downloadPointersToSavePaths release];
    [suffixFilter release];
    [super dealloc];
}

/* The delegate is not retained. */
- (void)setDelegate:(id)theDelegate
{
    delegate = theDelegate;
}

/*
    Starts downloading the document at `url` to a temporary file. When the
    download finishes the file is parsed on a background thread; failures are
    reported through linkScraperDidFail:.
*/
- (void)scrapeURL:(NSString *)url
{
    // create request
    NSURLRequest *theRequest = [NSURLRequest requestWithURL:[NSURL URLWithString:url]
                                                cachePolicy:NSURLRequestUseProtocolCachePolicy
                                            timeoutInterval:20.0];

    // start downloading at once
    NSURLDownload *download = [[[NSURLDownload alloc] initWithRequest:theRequest
                                                             delegate:self] autorelease];
    if (download) {
        // create a unique filename
        char uniqueFilename[30] = "/tmp/fooXXXXXXXX";
        mktemp(uniqueFilename);
        [download setDestination:[NSString stringWithUTF8String:uniqueFilename]
                  allowOverwrite:NO];

        // the array is what keeps the download alive
        [downloadObjects addObject:download];
    }
    else {
        if ([delegate respondsToSelector:@selector(linkScraperDidFail:)])
            [delegate linkScraperDidFail:self];
    }
}

/* Sets the suffix filter (e.g. ".txt"); pass nil to accept every link. */
- (void)setSuffixFilter:(NSString *)filter
{
    [suffixFilter autorelease];
    suffixFilter = [filter copy];
}

#pragma mark -

/* NSURLDownload delegate: remembers where the download is being saved. */
- (void)download:(NSURLDownload *)theDownload didCreateDestination:(NSString *)path
{
    [downloadPointersToSavePaths setObject:path
                                    forKey:[NSValue valueWithPointer:theDownload]];
}

/* NSURLDownload delegate: drops all bookkeeping for the download and notifies the delegate. */
- (void)download:(NSURLDownload *)theDownload didFailWithError:(NSError *)error
{
    // downloadObjects may hold the last reference; keep the download alive
    // until this callback has returned.
    [[theDownload retain] autorelease];

    // get rid of our references to the download object
    NSValue *downloadKey = [NSValue valueWithPointer:theDownload];
    [downloadPointersToSavePaths removeObjectForKey:downloadKey];
    [downloadObjects removeObject:theDownload];

    NSLog(@"Download failed! Error - %@ %@",
          [error localizedDescription],
          [[error userInfo] objectForKey:NSErrorFailingURLStringKey]);

    if ([delegate respondsToSelector:@selector(linkScraperDidFail:)])
        [delegate linkScraperDidFail:self];
}

/*
    NSURLDownload delegate: hands the downloaded file over to a background
    parsing thread.

    Everything the thread needs is copied into the args dictionary so it can
    work without locking. The dictionary owns the copies, and NSThread retains
    the dictionary for as long as the thread runs.
*/
- (void)downloadDidFinish:(NSURLDownload *)theDownload
{
    // downloadObjects may hold the last reference; keep the download alive
    // while we are still talking to it.
    [[theDownload retain] autorelease];

    // remove the download from our lists, keeping a copy of the path where it saved the HTML file
    NSValue *downloadPtr = [NSValue valueWithPointer:theDownload];
    NSString *savePathCopy = [[[downloadPointersToSavePaths objectForKey:downloadPtr] copy] autorelease];
    [downloadPointersToSavePaths removeObjectForKey:downloadPtr];
    [downloadObjects removeObject:theDownload];

    // copies of the original URL and of the suffix filter
    NSURL *urlCopy = [[[[theDownload request] URL] copy] autorelease];
    NSString *suffixFilterCopy = [[suffixFilter copy] autorelease];

    // get the current runloop, so we can call the delegate methods on the right thread later.
    // Note, CFRunLoopRef is not toll-free bridged to NSRunLoop.
    CFRunLoopRef currentLoop = CFRunLoopGetCurrent();

    // Add the entries one by one: any of the copies may be nil (most commonly
    // the filter), and a nil in a dictionaryWithObjectsAndKeys: list would
    // silently cut off every entry after it, including the runloop.
    NSMutableDictionary *args = [NSMutableDictionary dictionary];
    [args setObject:(id)currentLoop forKey:@"runloop"];
    if (urlCopy)
        [args setObject:urlCopy forKey:@"url"];
    if (savePathCopy)
        [args setObject:savePathCopy forKey:@"path"];
    if (suffixFilterCopy)
        [args setObject:suffixFilterCopy forKey:@"filter"];

    // spin off the new thread!
    [NSThread detachNewThreadSelector:@selector(parseHTML:) toTarget:self withObject:args];
}

#pragma mark -

/*
    Thread entry point. Parses the downloaded file named in `args` and reports
    the links found to the delegate on the runloop stored in `args`.

    args keys: @"runloop" (CFRunLoopRef, always present), @"url" (NSURL),
    @"path" (NSString) and @"filter" (NSString); the last three may be absent.

    This method only accesses delegate - no other instance variables (important for thread safety).
*/
- (void)parseHTML:(NSDictionary *)args
{
    NSAutoreleasePool *pool = [NSAutoreleasePool new];

    // args owns these objects and outlives this method.
    NSURL *documentURL = [args objectForKey:@"url"];
    NSString *path = [args objectForKey:@"path"];
    NSString *filter = [args objectForKey:@"filter"];
    CFRunLoopRef callingRunloop = (CFRunLoopRef)[args objectForKey:@"runloop"];

    NSData *data = nil;
    if (path) {
        data = [[NSFileHandle fileHandleForReadingAtPath:path] readDataToEndOfFile];
        // the temporary file has served its purpose once it is in memory
        unlink([path fileSystemRepresentation]);
    }

    NSMutableDictionary *links = [NSMutableDictionary dictionary];

    {
        // scoped so that the context is torn down before the pool goes away
        parsing_context context(links, filter, documentURL);

        if ([data length] > 0) {
            map_html_tags((const char *)[data bytes], [data length],
                          (void (*)(struct taginfo*, void*))parse_links, // yikes!
                          &context,
                          MHT_TRIM_VALUES, // trim all whitespace in attributes.
                          NULL, NULL);
        }
    }

    if (callingRunloop && [delegate respondsToSelector:@selector(linkScraperDidFinish:links:)]) {
        // Tell the delegate we're finished - but on the same thread/runloop we were called
        // from (in case the delegate itself is a multithreaded app).
        //
        // Hack: since there doesn't seem to be any easier way to perform a selector with two
        // arguments on a given runloop, create a one-shot timer that calls a helper method on
        // that runloop; the helper then makes the actual linkScraperDidFinish:links: call.
        NSTimer *t = [NSTimer timerWithTimeInterval:0.0
                                             target:self
                                           selector:@selector(dispatchDidFinishCall:)
                                           userInfo:links
                                            repeats:NO];

        // assign the timer to the calling runloop
        CFRunLoopAddTimer(callingRunloop, (CFRunLoopTimerRef)t, kCFRunLoopDefaultMode);

        // ensure the runloop is awake to process the timer
        CFRunLoopWakeUp(callingRunloop);
    }

    [pool release];
}

// Dispatches the linkScraperDidFinish:links: call to the delegate. Invoked by the
// one-shot timer that parseHTML: schedules on the runloop that was current when the
// download finished.
- (void)dispatchDidFinishCall:(NSTimer *)timer
{
    NSDictionary *links = [timer userInfo];
    [delegate linkScraperDidFinish:self links:links];
}

/*
    Returns the plaintext version of an HTML chunk: everything between a '<'
    and the next '>' is dropped, everything else is kept. E.g.,
    "<b>hello</b> <i>world</i>" becomes "hello world". If a '<' has no
    matching '>', the remainder of the chunk is discarded.

    The result is autoreleased. A nil chunk yields an empty string.
*/
+ (NSString *)plaintextifyHTML:(NSString *)chunk
{
    NSMutableString *plaintext = [NSMutableString string];

    NSCharacterSet *tagStart = [NSCharacterSet characterSetWithCharactersInString:@"<"];
    NSCharacterSet *tagEnd = [NSCharacterSet characterSetWithCharactersInString:@">"];

    // the part of the chunk that has not been looked at yet
    NSRange remaining = NSMakeRange(0, [chunk length]);

    while (remaining.length > 0) {
        NSRange open = [chunk rangeOfCharacterFromSet:tagStart options:0 range:remaining];
        if (open.location == NSNotFound) {
            // no more tags: the rest is plain text
            [plaintext appendString:[chunk substringWithRange:remaining]];
            break;
        }

        // keep the text between the previous tag and this one
        NSRange text = NSMakeRange(remaining.location, open.location - remaining.location);
        if (text.length > 0)
            [plaintext appendString:[chunk substringWithRange:text]];

        // skip all contents and attrs of this tag
        NSRange afterOpen = NSMakeRange(NSMaxRange(open), NSMaxRange(remaining) - NSMaxRange(open));
        NSRange close = [chunk rangeOfCharacterFromSet:tagEnd options:0 range:afterOpen];
        if (close.location == NSNotFound)
            break; // unterminated tag

        remaining = NSMakeRange(NSMaxRange(close), NSMaxRange(afterOpen) - NSMaxRange(close));
    }

    return plaintext;
}

@end

/*
    Builds an autoreleased string from raw document bytes. UTF-8 is tried
    first (plain ASCII is a subset of it); bytes that are not valid UTF-8 are
    decoded as ISO Latin-1 instead.
    TODO: guess the document's real encoding.
*/
static NSString *ls_string_from_bytes(const char *bytes, size_t length)
{
    if (!bytes)
        return nil;

    NSString *s = [[NSString alloc] initWithBytes:bytes length:length encoding:NSUTF8StringEncoding];
    if (!s)
        s = [[NSString alloc] initWithBytes:bytes length:length encoding:NSISOLatin1StringEncoding];
    return [s autorelease];
}

/* Same as ls_string_from_bytes, for a NUL-terminated C string (NULL yields nil). */
static NSString *ls_string_from_c_string(const char *cString)
{
    return cString ? ls_string_from_bytes(cString, strlen(cString)) : nil;
}

/* Handles </a>: stores the link's name for the URL recorded by the matching start tag. */
static void ls_handle_anchor_end(struct taginfo *tag, parsing_context *ctx)
{
    if (ctx->_lastStartTagEndPos && ctx->_lastAddedURL &&
        tag->start_position >= ctx->_lastStartTagEndPos) {
        // the link contents are whatever is between <a ...> and </a>
        // TODO: decode HTML entities (e.g. &amp; -> '&')
        NSString *linkName = ls_string_from_bytes(ctx->_lastStartTagEndPos,
                                                  tag->start_position - ctx->_lastStartTagEndPos);

        // html-parse doesn't strip nested tags for us
        if (linkName && ctx->_lastStartTagContainsOtherTags)
            linkName = [LinkScraper plaintextifyHTML:linkName];

        if (linkName)
            [ctx->links setObject:linkName forKey:ctx->_lastAddedURL];
    }

    // reset state
    [ctx->_lastAddedURL release];
    ctx->_lastAddedURL = nil;
    ctx->_lastStartTagEndPos = NULL;
    ctx->_lastStartTagContainsOtherTags = false;
}

/* Handles <a ...>: records the (absolute) URL of the link if it passes the filter. */
static void ls_handle_anchor_start(struct taginfo *tag, parsing_context *ctx)
{
    ctx->_lastStartTagEndPos = (char *)tag->end_position;
    ctx->_lastStartTagContainsOtherTags = false;

    // look through the attributes
    for (int i = 0; i < tag->nattrs; ++i) {
        struct attr_pair *pair = &tag->attrs[i];
        if (!pair->name || strcmp(pair->name, "href") != 0)
            continue;

        // forget whatever URL we were tracking before
        [ctx->_lastAddedURL release];
        ctx->_lastAddedURL = nil;

        NSString *href = ls_string_from_c_string(pair->value);
        if (!href)
            continue;
        if (ctx->filter && ![href hasSuffix:ctx->filter])
            continue;

        // TODO: worth it to optimize here to avoid creating NSURL objects when we already get an absolute URL?
        // let NSURL take care of the dirty work of expanding any relative URL to its absolute form.
        NSURL *absURL = [NSURL URLWithString:href
                               relativeToURL:(ctx->_baseURL ? ctx->_baseURL : ctx->documentURL)];
        NSString *absString = [absURL absoluteString];

        if (absString) {
            // released when the closing tag is found, or when the next href replaces it.
            ctx->_lastAddedURL = [absString copy];
            // placeholder until (and unless) the closing tag supplies a name
            [ctx->links setObject:[NSNull null] forKey:ctx->_lastAddedURL];
        }
        else {
            // NSURL didn't like the URL we gave it. probably a javascript: URL or some such.
            // just keep the URI as is.
            ctx->_lastAddedURL = [href copy];
        }
    }
}

/* Handles <base ...>: remembers its href as the base for resolving relative URLs. */
static void ls_handle_base(struct taginfo *tag, parsing_context *ctx)
{
    for (int i = 0; i < tag->nattrs; ++i) {
        struct attr_pair *pair = &tag->attrs[i];
        if (!pair->name || strcmp(pair->name, "href") != 0)
            continue;

        NSString *base = ls_string_from_c_string(pair->value);
        if (!base)
            continue;

        // the context releases this when going away
        [ctx->_baseURL release];
        ctx->_baseURL = [[NSURL alloc] initWithString:base];
    }
}

/*
    Callback for the html-parser, invoked once per tag. Adds every link and
    its name to the context's dictionary, and tracks <base> tags so relative
    URLs can be resolved.
*/
void parse_links(struct taginfo *tag, parsing_context *ctx)
{
    // return early if there's nothing interesting.
    if (!tag || !tag->name || !ctx)
        return;

    NSAutoreleasePool *pool = [NSAutoreleasePool new];

    if (strcmp(tag->name, "a") == 0) {
        if (tag->end_tag_p)
            ls_handle_anchor_end(tag, ctx);
        else
            ls_handle_anchor_start(tag, ctx);
    }
    else {
        if (strcmp(tag->name, "base") == 0)
            ls_handle_base(tag, ctx);

        // current tag is not an <a> tag, so an open link now has nested tags
        if (ctx->_lastStartTagEndPos)
            ctx->_lastStartTagContainsOtherTags = true;
    }

    [pool release];
}