Reading time: 3 minutes
A couple of months ago, I started reading about Rust, and after buying access to a web-scraping API called RocketScrape, I decided to write my first program.
To begin, you will need to add three dependencies to Cargo.toml: the select, tokio, and reqwest crates.
[dependencies]
select = "0.5.0"
tokio = { version = "1", features = ["full"] }
reqwest = "0.11.4"
And here is the code:
use select::document::Document;
use select::predicate::{Attr, Name, Predicate};

const NO_VALUE: &str = "NOT FOUND";
const APIKEY: &str = "write here your API number";

async fn scrapebooks(i: i32) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    // The RocketScrape endpoint proxies the request to the target page.
    let rocketapi = format!("https://api.rocketscrap.io/?apiKey={}&url=", APIKEY);
    let site_url = format!("http://books.toscrape.com/catalogue/page-{}.html", i);
    let url = format!("{}{}", rocketapi, site_url);

    let response = reqwest::get(&url).await?;
    if response.status().is_success() {
        let response = response.text().await?;
        let document = Document::from(response.as_str());
        // Each book on the page lives inside an <article> tag.
        for node in document.find(Name("article")) {
            let name = node
                .find(Name("h3").descendant(Name("a")))
                .next()
                .map_or(NO_VALUE.into(), |n| n.text());
            let price = node
                .find(Attr("class", "price_color"))
                .next()
                .map_or(NO_VALUE.into(), |n| n.text());
            let link = node
                .find(Name("h3").descendant(Name("a")))
                .filter_map(|n| n.attr("href"))
                .next()
                .map_or(NO_VALUE.into(), |n| n.to_string());
            let photo = node
                .find(
                    Name("article")
                        .descendant(Name("div"))
                        .descendant(Name("a"))
                        .descendant(Name("img")),
                )
                .filter_map(|n| n.attr("src"))
                .next()
                .map_or(NO_VALUE.into(), |n| n.to_string());
            // One comma-separated line per book.
            println!("{:?},{:?},{:?},{:?},{:?}", site_url, name, price, link, photo);
        }
    } else {
        println!("Status: {:?} - Response failed from: {}", response.status(), &url);
    }
    Ok(())
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Spawn one task per catalogue page so all 40 requests run concurrently.
    let mut handles: Vec<_> = Vec::new();
    for i in 1..=40 {
        let job = tokio::spawn(async move { scrapebooks(i).await });
        handles.push(job);
    }
    // Wait for every task to finish before exiting.
    let mut results = Vec::new();
    for job in handles {
        results.push(job.await);
    }
    Ok(())
}
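One thing to be aware of: tokio::spawn returns a JoinHandle, so awaiting each job yields a Result (did the task panic?) wrapping the Result returned by scrapebooks (did the scrape fail?). The main above collects these without inspecting them. Here is a minimal sketch of a collection loop that surfaces both kinds of failure; the collect_results name is just for illustration:

use tokio::task::JoinHandle;

type ScrapeResult = Result<(), Box<dyn std::error::Error + Send + Sync>>;

// Hypothetical helper: drains the JoinHandles from main and reports failures.
async fn collect_results(handles: Vec<JoinHandle<ScrapeResult>>) {
    for job in handles {
        match job.await {
            Ok(Ok(())) => {}                                 // page scraped successfully
            Ok(Err(e)) => eprintln!("scrape failed: {}", e), // scrapebooks returned an error
            Err(e) => eprintln!("task panicked: {}", e),     // the spawned task itself aborted
        }
    }
}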
After running the program, the results are printed to the screen in a CSV-like format.
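If you want a real CSV file instead of simulated output, one option is the csv crate (an extra dependency, csv = "1", not used in the program above). A minimal sketch, where the books.csv path and the placeholder values are illustrative only:

use csv::Writer;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut wtr = Writer::from_path("books.csv")?;
    // Header row matching the fields scrapebooks prints.
    wtr.write_record(&["page", "name", "price", "link", "photo"])?;
    // In the scraper, this call would replace the println! inside the loop.
    wtr.write_record(&["<site_url>", "<name>", "<price>", "<link>", "<photo>"])?;
    wtr.flush()?;
    Ok(())
}

Writer buffers internally, so the flush call at the end matters before the program exits.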
Thanks to the concurrent tasks, the program runs quite fast: grabbing all 40 pages of results takes less than 2 seconds.
This article has been heavily inspired by the work of Xavier Tao at https://able.bio/haixuanTao/web-scraper-python-vs-rust--d6176429