95 lines
2.8 KiB
Rust
95 lines
2.8 KiB
Rust
use structopt::StructOpt;
|
|
|
|
mod opts;
|
|
|
|
use html5ever::serialize::{serialize, SerializeOpts, TraversalScope};
|
|
use markup5ever::serialize::Serialize as MSerialize;
|
|
use opts::Opts;
|
|
use std::error::Error;
|
|
use std::fs::File;
|
|
use std::io;
|
|
use std::io::Write;
|
|
use std::io::{stdin, stdout, Read};
|
|
|
|
use kuchiki::traits::*;
|
|
|
|
/// Reads everything remaining in `source` and returns it as a `String`.
///
/// `source` may be any reader, including an unsized trait object such as
/// `&mut dyn Read`.
///
/// # Errors
///
/// Returns the underlying I/O error if reading fails. Input that is not valid
/// UTF-8 is reported as an error of kind `io::ErrorKind::InvalidData`.
fn read_html<I: Read + ?Sized>(source: &mut I) -> io::Result<String> {
    let mut html = String::new();
    source.read_to_string(&mut html)?;
    Ok(html)
}
|
|
|
|
fn serialize_node<N: MSerialize>(node: &N) -> Result<String, Box<dyn Error>> {
|
|
let mut bytes = Vec::new();
|
|
serialize(
|
|
&mut bytes,
|
|
node,
|
|
SerializeOpts {
|
|
scripting_enabled: true,
|
|
create_missing_parent: false,
|
|
traversal_scope: TraversalScope::IncludeNode,
|
|
},
|
|
)?;
|
|
let s = String::from_utf8(bytes)?;
|
|
Ok(s)
|
|
}
|
|
|
|
fn main() -> Result<(), Box<Error>> {
|
|
let stdin = stdin();
|
|
let opt = Opts::from_args();
|
|
|
|
let html = if let Some(path) = opt.input {
|
|
let mut file = File::open(&path)?;
|
|
read_html(&mut file)?
|
|
} else {
|
|
let mut handle = stdin.lock();
|
|
read_html(&mut handle)?
|
|
};
|
|
|
|
let mut out: Box<Write> = if let Some(path) = opt.output {
|
|
let file = File::create(&path)?;
|
|
Box::new(file)
|
|
} else {
|
|
Box::new(stdout())
|
|
};
|
|
|
|
let css_selector = opt.selector;
|
|
|
|
let document = kuchiki::parse_html().one(html);
|
|
|
|
for css_match in document.select(&css_selector).unwrap() {
|
|
// css_match is a NodeDataRef, but most of the interesting methods are
|
|
// on NodeRef. Let's get the underlying NodeRef.
|
|
let as_node = css_match.as_node();
|
|
|
|
// In this example, as_node represents an HTML node like
|
|
//
|
|
// <p class='foo'>Hello world!</p>"
|
|
//
|
|
// Which is distinct from just 'Hello world!'. To get rid of that <p>
|
|
// tag, we're going to get each element's first child, which will be
|
|
// a "text" node.
|
|
//
|
|
// There are other kinds of nodes, of course. The possibilities are all
|
|
// listed in the `NodeData` enum in this crate.
|
|
//let text_node = as_node.first_child().unwrap();
|
|
|
|
// Let's get the actual text in this text node. A text node wraps around
|
|
// a RefCell<String>, so we need to call borrow() to get a &str out.
|
|
//let text = text_node.as_text().unwrap().borrow();
|
|
out.write_all(serialize_node(as_node)?.as_bytes())?;
|
|
/*if let Some(child) = as_node.first_child() {
|
|
|
|
//TODO: Convert Nodes to String, as this only works for Nodes containing plain text
|
|
if let Some(text) = child.as_text() {
|
|
// Prints:
|
|
//
|
|
// "Hello, world!"
|
|
// "I love HTML"
|
|
println!("{:?}", text.borrow());
|
|
}
|
|
}*/
|
|
}
|
|
Ok(())
|
|
}
|