KSoup is an HTML parser for Kotlin built on top of jsoup. It provides a concise DSL for extracting data from HTML documents and manipulating their contents.
Getting Started
Add the dependency (Gradle Kotlin DSL):
implementation("io.github.webtools:ksoup:0.3.0")
Parse HTML:
val html = """
  <html>
  ...
  </html>
"""
val doc = KSoup.parse(html)
Find elements:
doc.select(".content")
Extract text:
doc.body()!!.text()
Selecting
By CSS query:
doc.select(".main")
By tag:
doc.select("img")
By id:
doc.getElementById("header")
By attribute:
doc.select("[href]")
Custom filters:
doc.select(".txt").filter { it.text().length > 10 }
Traversing
Children:
element.children()
Parents:
element.parents()
Siblings:
element.nextSibling()
element.previousSibling()
Manipulation
Set text:
element.text("new text")
Set HTML:
element.html("<span>new html</span>")
Add class:
element.addClass("highlighted")
Remove class:
element.removeClass("highlighted")
Remove element:
element.remove()
Attributes
Get attribute:
element.attr("href")
Set attribute:
element.attr("href", "link.html")
Remove attribute:
element.removeAttr("class")
Examples
Extract text from paragraphs:
doc.select("p").forEach {
  println(it.text())
}
Extract links:
doc.select("a[href]").forEach {
  println(it.attr("href"))
}
Change image src:
doc.select("img").forEach {
  it.attr("src", "new.png")
}
Validation
Check a parsed document for validation errors:
val errors = KSoupValidator().validate(doc)
if (errors.isNotEmpty()) {
  // handle errors
}
Advanced Usage
Async parsing:
KSoup.parseAsync(html) { doc ->
  // process doc
}
Multithreading (process each document on its own thread):
docs.map { doc ->
  thread {
    // extract data
  }
}
More Examples
val links = doc.select("a[href]").map { it.attr("href") }
val headers = doc.select("h1, h2, h3").map { it.text() }
doc.select(".ad").remove()
Tips & Tricks
doc.select(".news").hasClass("updated")
doc.select("a").hasAttr("target")
doc.select(".news").has("img")
val html = doc.select("p").outerHtml()
KSoup.parse(htmlFragment, "")
Coroutines and Threading
GlobalScope.launch {
  val doc = KSoup.parseAsync(html)
  // process doc
}
doc.select(".chapter").map { chapter ->
  thread {
    // extract data from each chapter
  }
}
Advanced Validation (custom rules, ignored checks, auto-correction)
val rules = object : ValidatorRules {
  override fun getTagRules() = //...
}
KSoupValidator(rules).validate(doc)
KSoupValidator().ignore(MissingAltText::class).validate(doc)
KSoupValidator().autoCorrect().validate(doc)