The Ultimate Select.rs Cheat Sheet for Rust

Oct 31, 2023 · 5 min read

select.rs is a robust HTML/XML scraping library for Rust. This cheat sheet aims to cover its features in depth.

Installation

[dependencies]
select = "0.6"

Loading Documents

From str:

let doc = Document::from(r#"<html>...</html>"#);

From file:

let doc = Document::from_file("page.html")?;

From URL:

let resp = reqwest::get("https://example.com")?;
let doc = Document::from_read(resp)?;

From bytes:

let bytes: Vec<u8> = fetch_bytes();
let doc = Document::from_bytes(bytes)?;

Custom options:

let doc = Document::from_file("doc.html", ParserOptions::no_network())?;

Selecting Nodes

CSS selector:

let paras = doc.find("p");

XPath selector:

let items = doc.xpath("//ul/li");

Namespaced XPath:

doc.xpath("//x:item", [("x", "http://example.com")]);

Cache selector:

let query = "./product";
let prods = doc.find(query);

Traversing Nodes

Parent:

let parent = node.parent();

Children:

let children = parent.children();

Previous sibling:

let prev = node.prev_sibling();

Next sibling:

let next = node.next_sibling();

Ancestors:

let ancestors = node.ancestors();

Descendants:

let descendants = node.descendants();

Extracting Data

Text from node:

let text = node.text();

Text from selector:

let text = doc.find("p").text();

HTML from node:

let html = node.html();

HTML from selector:

let html = doc.find("div").html();

Modifying Nodes

Set class:

node.set_class("blue");

Set id:

node.set_id("main");

Set attribute:

node.set_attr("lang", "en");

Set text:

node.text_mut().push("Hello!");

Set HTML:

node.html_mut().push_str("<strong>Hi</strong>");

Creating Nodes

New element:

let para = Element::new("p");

New text node:

let text = Text::new("Hello");

New comment:

let comment = Comment::new("A comment");

New fragment:

let frag = DocumentFragment::new();

Inserting Nodes

Append child:

parent.append(node);

Insert before:

parent.insert_before(new_node, ref_node);

Insert after:

parent.insert_after(new_node, ref_node);

Prepend child:

parent.prepend(new_node);

Removing Nodes

Remove node:

parent.remove(child);

Remove children:

parent.remove_children();

Remove by selector:

doc.find(".deleted").remove();

Output Formats

HTML string:

let html = doc.to_string();

Plain text:

let text = doc.text();

YAML data:

let data = doc.to_yaml_vec();

JSON data:

let json = serde_json::to_string(&doc)?;

Caching and Persistence

In-memory cache:

use std::collections::HashMap;

let mut cache: HashMap<String, Document> = HashMap::new();

// Cache document
cache.insert("url".into(), doc.clone());

Persistent cache:

let cached = storage.restore("url");
if let Some(doc) = cached {
  // Use cached doc
} else {
  // Fetch and cache
}

Headless Browsers

Integrate with playwright:

let html = pw::get_document("https://example.com").await?.html();
let doc = Document::from(html);

Validation

DTD validate:

let mut opts = ParserOptions::default();
opts.validate_dtd = true;
Document::from_file("file.html", opts)?;

XSD validate:

let schema = include_bytes!("schema.xsd");
doc.validate_xsd(schema)?;

Encoding

Handle special chars:

let text = node.text()
   .decode_lossy("windows-1252");

Convert encoding:

use encoding::{EncoderTrap, Encoding};

let text = "àèì";
let encoded = text.encode(Encoding::ISO_8859_1, EncoderTrap::Strict);

Real World Use Cases

  • Web scrapers and crawlers
  • Data mining pipelines
  • Web automation bots
  • Archiving sites
  • Paywall/tracker bypassing
  • Comparing documents
  • Extracting research datasets
  • Migrating between CMSs
  • PDF generation
  • Building static site generators
  • Automated testing suites
  • Screenshot testing tools
    Advanced Selectors
    
    Attribute selector:
    
    let inputs = doc.find(r#"input[type="text"]"#);
    

    Pseudo selector:

    let paras = doc.find("p:nth-child(2)");
    

    Combining selectors:

    let elems = doc.find("div.container > p.text");
    

    Chaining selectors:

    let links = doc.find("div.post").find("a");
    

    Selector Contexts

    On document:

    let items = doc.find("li");
    

    On node:

    let links = node.find("a");
    

    On vector:

    let texts = nodes.find("p").text();
    

    On fragment:

    let paras = fragment.find("p");
    

    Caching and Performance

    Caching parsed documents:

    let doc = cache.get("url").cloned().unwrap_or_else(|| {
      let doc = Document::from_file(url)?;
      cache.insert(url, doc.clone());
      doc
    });
    

    Parallelizing scraping:

    let docs = urls.par_iter().map(fetch_page).collect();
    

    Reusing scrapers:

    #[derive(Clone)]
    struct PageScraper {
      // ...
    }
    
    let scraper = PageScraper::new();
    scraper.scrape(doc1);
    scraper.scrape(doc2);
    

    Common Recipes

    Scraping tables:

    let rows = doc.find("table tr").filter(|r| !r.find("th").exists());
    

    Paginating content:

    let urls = discover_pages();
    for url in urls {
      let doc = fetch_page(url);
      // scrape page
    }
    

    Extracting metadata:

    let title = doc.find("meta[name=title]").attr("content");
    

    Troubleshooting

    Network error handling:

    match reqwest::get(url) {
      Ok(resp) => { /* ... */ }
      Err(e) if e.is_timeout() => { /* Retry */ }
      Err(e) => { /* Report error */ }
    }
    

    Malformed HTML recovery:

    let doc = Document::recover_from_read(html);
    

    Invalid XPath debugging:

    let parsed = xpaths::parse(xpath);
    if let Err(e) = parsed {
      // Log or display error
    }
    

    Ecosystem Libraries

    HTML parsing with html5ever:

    let html = // ...
    let dom = html5ever::parse_document(html.as_bytes());
    

    Lightweight DOM with kuchiki:

    let dom = kuchiki::parse_html().one(html);
    

    High-level scraping with scraper:

    let page = scraper::Html::parse_document(html);
    let books = page.select(&selector);
    

    Browse by tags:

    Browse by language:

    The easiest way to do Web Scraping

    Get HTML from any page with a simple API call. We handle proxy rotation, browser identities, automatic retries, CAPTCHAs, JavaScript rendering, and more — automatically, for you.


    Try ProxiesAPI for free

    curl "http://api.proxiesapi.com/?key=API_KEY&url=https://example.com"

    <!doctype html>
    <html>
    <head>
        <title>Example Domain</title>
        <meta charset="utf-8" />
        <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
    ...

    X

    Don't leave just yet!

    Enter your email below to claim your free API key: