htmlextract/src/main.rs
Shimun 12e32c622e
Some checks are pending
continuous-integration/drone/push Build is passing
omit fmt call
2019-03-26 17:57:40 +01:00

95 lines
2.8 KiB
Rust

use structopt::StructOpt;
mod opts;
use html5ever::serialize::{serialize, SerializeOpts, TraversalScope};
use markup5ever::serialize::Serialize as MSerialize;
use opts::Opts;
use std::error::Error;
use std::fs::File;
use std::io;
use std::io::Write;
use std::io::{stdin, stdout, Read};
use kuchiki::traits::*;
/// Drains `source` to its end and returns everything read as one `String`.
///
/// # Errors
///
/// Propagates any I/O error reported by the reader, which includes the
/// error `read_to_string` raises when the bytes are not valid UTF-8.
fn read_html<I: Read>(source: &mut I) -> io::Result<String> {
    let mut buffer = String::new();
    // The byte count returned by `read_to_string` is not needed; hand back
    // the filled buffer instead.
    source.read_to_string(&mut buffer).map(|_| buffer)
}
/// Serializes `node` with html5ever and returns the markup as a `String`.
///
/// The traversal scope is `IncludeNode`, so the node's own tag is emitted
/// together with its contents.
///
/// # Errors
///
/// Returns the serializer's error if writing fails, or a UTF-8 conversion
/// error if the produced bytes are not valid UTF-8.
fn serialize_node<N: MSerialize>(node: &N) -> Result<String, Box<dyn Error>> {
    let options = SerializeOpts {
        scripting_enabled: true,
        create_missing_parent: false,
        traversal_scope: TraversalScope::IncludeNode,
    };

    let mut buffer: Vec<u8> = Vec::new();
    serialize(&mut buffer, node, options)?;

    Ok(String::from_utf8(buffer)?)
}
fn main() -> Result<(), Box<Error>> {
let stdin = stdin();
let opt = Opts::from_args();
let html = if let Some(path) = opt.input {
let mut file = File::open(&path)?;
read_html(&mut file)?
} else {
let mut handle = stdin.lock();
read_html(&mut handle)?
};
let mut out: Box<Write> = if let Some(path) = opt.output {
let file = File::create(&path)?;
Box::new(file)
} else {
Box::new(stdout())
};
let css_selector = opt.selector;
let document = kuchiki::parse_html().one(html);
for css_match in document.select(&css_selector).unwrap() {
// css_match is a NodeDataRef, but most of the interesting methods are
// on NodeRef. Let's get the underlying NodeRef.
let as_node = css_match.as_node();
// In this example, as_node represents an HTML node like
//
// <p class='foo'>Hello world!</p>"
//
// Which is distinct from just 'Hello world!'. To get rid of that <p>
// tag, we're going to get each element's first child, which will be
// a "text" node.
//
// There are other kinds of nodes, of course. The possibilities are all
// listed in the `NodeData` enum in this crate.
//let text_node = as_node.first_child().unwrap();
// Let's get the actual text in this text node. A text node wraps around
// a RefCell<String>, so we need to call borrow() to get a &str out.
//let text = text_node.as_text().unwrap().borrow();
out.write_all(serialize_node(as_node)?.as_bytes())?;
/*if let Some(child) = as_node.first_child() {
//TODO: Convert Nodes to String, as this only works for Nodes containing plain text
if let Some(text) = child.as_text() {
// Prints:
//
// "Hello, world!"
// "I love HTML"
println!("{:?}", text.borrow());
}
}*/
}
Ok(())
}