This commit is contained in:
Drone CI
2019-03-24 18:57:59 +01:00
commit 2dc37ba196
4 changed files with 795 additions and 0 deletions

72
src/main.rs Normal file
View File

@@ -0,0 +1,72 @@
#[macro_use]
extern crate structopt;
use structopt::StructOpt;
mod opts;
use opts::Opts;
use std::error::Error;
use std::fs::File;
use std::io;
use std::io::{stdin, Read};
use std::path::PathBuf;
extern crate kuchiki;
use kuchiki::traits::*;
/// Drains `source` to its end and returns everything it produced as one
/// owned `String`.
///
/// # Errors
///
/// Forwards whatever `io::Error` `Read::read_to_string` reports (for example
/// when the bytes are not valid UTF-8).
fn read_html<I: Read>(source: &mut I) -> io::Result<String> {
    let mut buffer = String::new();
    // The byte count returned on success is not needed; hand back the buffer.
    source.read_to_string(&mut buffer).map(|_| buffer)
}
/// Entry point: parses the command line, loads an HTML document and prints
/// the leading text of every element matched by the CSS selector.
///
/// The document is read from the file given as `input` when present,
/// otherwise from standard input.
///
/// # Errors
///
/// Returns an error when the input file cannot be opened, when reading the
/// input fails, or when the selector is rejected by `select` (previously this
/// last case panicked through `unwrap()`).
fn main() -> Result<(), Box<dyn Error>> {
    let opt = Opts::from_args();
    let html = if let Some(path) = opt.input {
        let mut file = File::open(&path)?;
        read_html(&mut file)?
    } else {
        // Only touch stdin when no input file was given.
        let stdin = stdin();
        let mut handle = stdin.lock();
        read_html(&mut handle)?
    };
    let css_selector = opt.selector;
    let document = kuchiki::parse_html().one(html);
    // The selector is user input, so a bad one must surface as a regular
    // error instead of a panic. `select` reports failure as `Err(())`, which
    // carries no detail; attach the offending selector to the message.
    let matches = document
        .select(&css_selector)
        .map_err(|()| format!("invalid CSS selector: {:?}", css_selector))?;
    for css_match in matches {
        // css_match is a NodeDataRef, but most of the interesting methods are
        // on NodeRef. Let's get the underlying NodeRef.
        let as_node = css_match.as_node();
        // as_node represents a whole HTML element such as
        //
        //     <p class='foo'>Hello world!</p>
        //
        // which is distinct from just 'Hello world!'. To get rid of the <p>
        // tag we look at the element's first child and print it only if it
        // is a text node. Elements whose first child is not a text node (or
        // that have no children) are skipped.
        if let Some(child) = as_node.first_child() {
            // TODO: Convert Nodes to String, as this only works for Nodes
            // containing plain text.
            if let Some(text) = child.as_text() {
                // A text node wraps a RefCell<String>, hence the borrow().
                // `{:?}` prints the text quoted, e.g.:
                //
                //     "Hello, world!"
                //     "I love HTML"
                println!("{:?}", text.borrow());
            }
        }
    }
    Ok(())
}

12
src/opts.rs Normal file
View File

@@ -0,0 +1,12 @@
use std::path::PathBuf;
use structopt::StructOpt;
// Command-line options of the `htmlextract` tool, parsed by structopt via
// `Opts::from_args()`.
//
// NOTE(review): plain `//` comments are used on purpose. structopt appears to
// turn `///` doc comments into `--help` text, so converting these would
// change the CLI output -- confirm before doing so.
#[derive(StructOpt, Debug)]
#[structopt(name = "htmlextract")]
pub struct Opts {
// CSS selector to match against the document; passed as `-s <selector>` or
// `--selector <selector>`.
#[structopt(short = "s", long = "selector")]
pub selector: String,
// Optional path of the HTML file to read, converted from the raw OS string
// into a `PathBuf`. It has no short/long flag, so it is presumably a
// positional argument. When it is `None`, main.rs reads the document from
// standard input instead.
#[structopt(parse(from_os_str))]
pub input: Option<PathBuf>,
}