The Ultimate HTMLParser Cheatsheet

Oct 31, 2023 · 4 min read

HTMLParser is an Objective-C wrapper for libxml2 that allows parsing HTML documents. It provides an event-driven interface like NSXMLParser.

Getting Started

Import:

#import "HTMLParser.h"

Initialize:

// Raw HTML bytes to parse; `...` is a placeholder for your own data source.
NSData *data = ...; // HTML data
// Create a parser over those bytes.
HTMLParser *parser = [[HTMLParser alloc] initWithData:data];

Set delegate:

// Register the current object to receive the parser's delegate callbacks.
parser.delegate = self;

Start parsing:

// Start parsing the data the parser was initialized with.
[parser parse];

Delegate Methods

Did start document:

/// Delegate callback for the start of the document.
/// @param parser The parser reporting the event.
- (void)parserDidStartDocument:(HTMLParser *)parser {
  // Parsing started
}

Did end document:

/// Delegate callback for the end of the document.
/// @param parser The parser reporting the event.
- (void)parserDidEndDocument:(HTMLParser *)parser {
  // Parsing done
}

Did start element:

/// Delegate callback for an opening tag.
/// @param parser The parser reporting the event.
/// @param elementName Name of the element that was opened.
/// @param attributeDict Attributes of the opened element.
- (void)parser:(HTMLParser *)parser
    didStartElement:(NSString *)elementName
         attributes:(NSDictionary *)attributeDict {
  // Element opened
}

Did end element:

/// Delegate callback for a closing tag.
/// @param parser The parser reporting the event.
/// @param elementName Name of the element that was closed.
- (void)parser:(HTMLParser *)parser
 didEndElement:(NSString *)elementName {
  // Element closed
}

Found characters:

/// Delegate callback delivering text found in the document.
/// @param parser The parser reporting the event.
/// @param string The text that was found.
- (void)parser:(HTMLParser *)parser
    foundCharacters:(NSString *)string {
  // Found text
}

Parsing Options

No error reporting:

// Turn off the parser's error reporting.
parser.reportErrors = NO;

Allow broken HTML:

// NOTE(review): in libxml2, HTML_PARSE_NOERROR is a bit flag of htmlParserOption
// ("suppress error reports"), not a settable field. If `options` is a flag mask,
// this presumably needs to be `parser.options |= HTML_PARSE_NOERROR;` — confirm
// against the HTMLParser header.
parser.options.HTML_PARSE_NOERROR = YES;

Validation

Check errors:

// Fetch whatever the parser recorded and react only when something is there.
NSArray *parseErrors = [parser errors];
if (parseErrors.count > 0) {
  // Handle errors
}

Tips

  • Set reportErrors to NO to ignore warnings
  • Delegate methods called on main thread
  • Use HTML_PARSE_NOERROR for malformed HTML
  • Reuse parser instances for better performance
Examples

    Simple HTML Parsing:

    /// Minimal parser delegate that logs the name of each element as it opens.
    @interface ParserDelegate : NSObject <HTMLParserDelegate>
    @end

    @implementation ParserDelegate

    #pragma mark - HTMLParserDelegate

    /// Logs the opened element's name; the attributes are not used.
    - (void)parser:(HTMLParser *)parser
        didStartElement:(NSString *)elementName
             attributes:(NSDictionary *)attributeDict {
      NSLog(@"Started element: %@", elementName);
    }

    @end
    
    // `...` is a placeholder for your own HTML bytes.
    NSData *htmlData = ...; // html
    
    // Build the parser and hand it a ParserDelegate to receive callbacks.
    HTMLParser *parser = [[HTMLParser alloc] initWithData:htmlData];
    ParserDelegate *delegate = [[ParserDelegate alloc] init];
    parser.delegate = delegate;
    
    // Start parsing.
    [parser parse];
    

    Extract Text:

    // Accumulates all text delivered by the parser for the current document.
    NSMutableString *text;

    /// Starts a fresh, empty buffer for each document.
    ///
    /// The buffer is created here rather than in an element callback for two
    /// reasons: creating it on every opening tag would throw away the text
    /// collected so far, and a `parser:didStartElement:` method without the
    /// `attributes:` keyword does not match the delegate callback selector
    /// `parser:didStartElement:attributes:`.
    - (void)parserDidStartDocument:(HTMLParser *)parser {
      text = [[NSMutableString alloc] init];
    }

    /// Appends each piece of text to the buffer.
    - (void)parser:(HTMLParser *)parser
        foundCharacters:(NSString *)string {
      [text appendString:string];
    }

    /// Logs the accumulated text once the document has ended.
    - (void)parserDidEndDocument:(HTMLParser *)parser {
      NSLog(@"Text: %@", text);
    }
    

    Entity Conversion

    Custom resolver:

    /// Delegate that supplies replacement data for selected external entities.
    @interface EntityResolver : NSObject <HTMLParserDelegate>
    @end

    @implementation EntityResolver

    /// Returns the replacement data when `name` matches the entity of interest,
    /// and nil otherwise.
    /// @param parser The parser asking for the entity.
    /// @param name Name of the external entity to resolve.
    - (NSData *)parser:(HTMLParser *)parser
        resolveExternalEntity:(NSString *)name {
      // Compare string contents with -isEqualToString:; `==` only compares
      // object pointers and can fail for equal strings.
      // `...` is a placeholder for the entity name you want to replace.
      if (![name isEqualToString:...]) {
        return nil;
      }

      // Placeholder: the NSData to substitute for the entity.
      return replacementData;
    }

    @end
    

    Writing HTML

    XMLElement:

    // Create a <div> element and give it the text "Hello".
    XMLElement *element = [[XMLElement alloc] initWithName:@"div"];
    [element setStringValue:@"Hello"];
    
    // Ask the element for its string form.
    NSString *html = [element XMLString];
    

    XMLNode:

    // Create a <span> node with the text "world" and attach it as a child of
    // `element` (an XMLElement created beforehand).
    XMLNode *node = [[XMLNode alloc] initWithName:@"span"];
    [node setStringValue:@"world"];
    [element addChild:node];
    

    Advanced Usage

    Incremental parsing:

    // Feed the parser one chunk at a time for as long as data remains.
    while (hasMoreData) {
      [parser parseChunk:getNextDataChunk()];
    }

    // Tell the parser that no further chunks will follow.
    [parser finishParsing];
    

    Custom input stream:

    // `...` is a placeholder for however you obtain your stream.
    NSInputStream *stream = ...; // custom stream
    
    // Create a parser bound to that stream.
    HTMLParser *parser = [[HTMLParser alloc] initWithStream:stream];
    
    // read and parse chunks
    

    Capture dynamic HTML:

    // with UIWebView

    // UIWebView's JavaScript bridge is -stringByEvaluatingJavaScriptFromString:.
    NSString *html = [webView stringByEvaluatingJavaScriptFromString:@"document.body.innerHTML"];

    // -initWithData: takes NSData, so encode the string before handing it over.
    NSData *htmlData = [html dataUsingEncoding:NSUTF8StringEncoding];
    HTMLParser *parser = [[HTMLParser alloc] initWithData:htmlData];
    

    Debugging

    Parser errors:

    // Log the human-readable description of every error the parser recorded.
    for (NSError *parseError in [parser errors]) {
      NSLog(@"%@", parseError.localizedDescription);
    }
    

    Enable info messages:

    // Class-level switch: turns on the parser's informational log messages.
    [HTMLParser setInfoMessageLoggingEnabled:YES];
    

    Handling Common Errors

    HTML parsing can result in errors due to malformed content or encoding issues. Always check the errors array:

    // Fetch whatever the parser recorded and react only when something is there.
    NSArray *parseErrors = [parser errors];
    if (parseErrors.count > 0) {
      // Handle errors
    }
    

    For malformed HTML, use the HTML_PARSE_NOERROR option:

    // NOTE(review): in libxml2, HTML_PARSE_NOERROR is a bit flag of
    // htmlParserOption ("suppress error reports"), not a settable field. If
    // `options` is a flag mask, this presumably needs to be
    // `parser.options |= HTML_PARSE_NOERROR;` — confirm against the HTMLParser header.
    parser.options.HTML_PARSE_NOERROR = YES;
    

    To handle encoding errors, specify the encoding:

    // Tell the parser to treat the input as UTF-8.
    [parser setEncoding:NSUTF8StringEncoding];
    

    More Delegate Examples

    Extract all images:

    /// Picks out the `src` attribute of every <img> element that opens.
    - (void)parser:(HTMLParser *)parser
        didStartElement:(NSString *)elementName
             attributes:(NSDictionary *)attributeDict {
      // Anything that is not an image tag is of no interest here.
      if (![elementName isEqualToString:@"img"]) {
        return;
      }

      NSString *imageSource = attributeDict[@"src"];
      // Download image
    }
    

    Find link URLs:

    /// Picks out the `href` attribute of every <a> element that opens.
    - (void)parser:(HTMLParser *)parser
        didStartElement:(NSString *)elementName
             attributes:(NSDictionary *)attributeDict {
      // Anything that is not an anchor tag is of no interest here.
      if (![elementName isEqualToString:@"a"]) {
        return;
      }

      NSString *linkTarget = attributeDict[@"href"];
      // Store link url
    }
    

    Performance Optimization

    Reuse parser instances:

    // One parser instance is shared by every document in the array.
    HTMLParser *parser = [[HTMLParser alloc] init];

    for (NSData *documentData in htmlDataArray) {
      [parser parse:documentData];
    }
    

    Incremental parsing:

    // Feed the parser one chunk at a time for as long as data is available.
    while (hasDataAvailable) {
      [parser parseChunk:[self getNextDataChunk]];
    }

    // Tell the parser that no further chunks will follow.
    [parser finishParsing];
    

    Advanced Usage

    Parse HTML from a UIWebView:

    // UIWebView's JavaScript bridge is -stringByEvaluatingJavaScriptFromString:.
    NSString *html = [webView stringByEvaluatingJavaScriptFromString:@"document.body.innerHTML"];

    // The parser takes NSData, so encode the page markup as UTF-8 first.
    HTMLParser *parser = [[HTMLParser alloc] initWithData:[html dataUsingEncoding:NSUTF8StringEncoding]];
    

    Use a custom input stream:

    NSInputStream *stream = [NSInputStream inputStreamWithURL:url];

    HTMLParser *parser = [[HTMLParser alloc] initWithStream:stream];

    // A stream delivers no bytes until it is opened.
    // NOTE(review): assumes the parser does not open the stream itself — confirm.
    [stream open];

    // NSInputStream has no -readDataOfLength:; read into a fixed buffer and
    // wrap the bytes actually received in NSData for the parser.
    uint8_t buffer[4096];
    NSInteger bytesRead;
    // -read:maxLength: returns 0 at end of stream and a negative value on
    // error; either one ends the loop.
    while ((bytesRead = [stream read:buffer maxLength:sizeof(buffer)]) > 0) {
      [parser parseChunk:[NSData dataWithBytes:buffer length:(NSUInteger)bytesRead]];
    }

    [stream close];
    

    Generating HTML

    Build a complex HTML structure:

    // Create every node first.
    XMLElement *html = [[XMLElement alloc] initWithName:@"html"];
    XMLElement *head = [[XMLElement alloc] initWithName:@"head"];
    XMLElement *body = [[XMLElement alloc] initWithName:@"body"];
    XMLNode *h1 = [[XMLNode alloc] initWithName:@"h1"];
    [h1 setStringValue:@"Hello World!"];

    // Then assemble the tree: <head> and <body> under <html>, <h1> under <body>.
    [html addChild:head];
    [html addChild:body];
    [body addChild:h1];

    // Ask the root element for its string form.
    NSString *htmlString = [html XMLString];
    

    Troubleshooting

    Enable parser info messages:

    // Class-level switch: turns on the parser's informational log messages.
    [HTMLParser setInfoMessageLoggingEnabled:YES];
    

    Log validation errors:

    // Log every validation error the parser recorded.
    for (NSError *validationError in [parser validationErrors]) {
      NSLog(@"%@", validationError);
    }
    

    Thread Safety

    Perform parsing on a background thread:

    // Run the parse on a default-priority global queue instead of the calling thread.
    dispatch_queue_t backgroundQueue =
        dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);

    dispatch_async(backgroundQueue, ^{
      HTMLParser *parser = [[HTMLParser alloc] initWithData:data];
      [parser parse];
    });
    

    Browse by tags:

    Browse by language:

    Tired of getting blocked while scraping the web?

    ProxiesAPI handles headless browsers and rotates proxies for you.
    Get access to 1,000 free API credits, no credit card required!