init
This commit is contained in:
72
src/main.rs
Normal file
72
src/main.rs
Normal file
@@ -0,0 +1,72 @@
|
||||
#[macro_use]
|
||||
extern crate structopt;
|
||||
use structopt::StructOpt;
|
||||
|
||||
mod opts;
|
||||
|
||||
use opts::Opts;
|
||||
use std::error::Error;
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::io::{stdin, Read};
|
||||
use std::path::PathBuf;
|
||||
extern crate kuchiki;
|
||||
|
||||
use kuchiki::traits::*;
|
||||
|
||||
/// Reads everything remaining in `source` and returns it as a `String`.
///
/// The bound is `?Sized` so that trait objects such as `&mut dyn Read` are
/// accepted in addition to concrete readers like `File` or `StdinLock`.
///
/// # Errors
///
/// Returns the underlying I/O error if reading fails, including the error
/// `read_to_string` reports when the data is not valid UTF-8.
fn read_html<I: Read + ?Sized>(source: &mut I) -> io::Result<String> {
    let mut html = String::new();
    source.read_to_string(&mut html)?;
    Ok(html)
}
|
||||
|
||||
fn main() -> Result<(), Box<Error>> {
|
||||
let stdin = stdin();
|
||||
let opt = Opts::from_args();
|
||||
|
||||
let html = if let Some(path) = opt.input {
|
||||
let mut file = File::open(&path)?;
|
||||
read_html(&mut file)?
|
||||
} else {
|
||||
let mut handle = stdin.lock();
|
||||
read_html(&mut handle)?
|
||||
};
|
||||
|
||||
let css_selector = opt.selector;
|
||||
|
||||
let document = kuchiki::parse_html().one(html);
|
||||
|
||||
for css_match in document.select(&css_selector).unwrap() {
|
||||
// css_match is a NodeDataRef, but most of the interesting methods are
|
||||
// on NodeRef. Let's get the underlying NodeRef.
|
||||
let as_node = css_match.as_node();
|
||||
|
||||
// In this example, as_node represents an HTML node like
|
||||
//
|
||||
// <p class='foo'>Hello world!</p>"
|
||||
//
|
||||
// Which is distinct from just 'Hello world!'. To get rid of that <p>
|
||||
// tag, we're going to get each element's first child, which will be
|
||||
// a "text" node.
|
||||
//
|
||||
// There are other kinds of nodes, of course. The possibilities are all
|
||||
// listed in the `NodeData` enum in this crate.
|
||||
//let text_node = as_node.first_child().unwrap();
|
||||
|
||||
// Let's get the actual text in this text node. A text node wraps around
|
||||
// a RefCell<String>, so we need to call borrow() to get a &str out.
|
||||
//let text = text_node.as_text().unwrap().borrow();
|
||||
|
||||
if let Some(child) = as_node.first_child() {
|
||||
//TODO: Convert Nodes to String, as this only works for Nodes containing plain text
|
||||
if let Some(text) = child.as_text() {
|
||||
// Prints:
|
||||
//
|
||||
// "Hello, world!"
|
||||
// "I love HTML"
|
||||
println!("{:?}", text.borrow());
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
12
src/opts.rs
Normal file
12
src/opts.rs
Normal file
@@ -0,0 +1,12 @@
|
||||
use std::path::PathBuf;
|
||||
use structopt::StructOpt;
|
||||
|
||||
#[derive(StructOpt, Debug)]
|
||||
#[structopt(name = "htmlextract")]
|
||||
pub struct Opts {
|
||||
#[structopt(short = "s", long = "selector")]
|
||||
pub selector: String,
|
||||
|
||||
#[structopt(parse(from_os_str))]
|
||||
pub input: Option<PathBuf>,
|
||||
}
|
Reference in New Issue
Block a user