Skip to content

Commit 53f78b9

Browse files
committed
support finding feed URL from HTML page
1 parent 7b06b88 commit 53f78b9

File tree

6 files changed

+160
-1
lines changed

6 files changed

+160
-1
lines changed

Cargo.lock

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ quick-xml = "0.18.1"
3535
log = "0.4.11"
3636
femme = "2.1.0"
3737
url = "2.1.1"
38+
either = "1.5.3"
3839

3940
[dev-dependencies]
4041
rand = "0.7"

src/cli.rs

Lines changed: 76 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
use anyhow::{anyhow, Context, Result};
22
use async_std::prelude::FutureExt;
3+
use either::Either;
34
use futures::stream::{self, StreamExt};
45
use log::{info, warn};
56
use prettytable::{cell, format, row, Table};
7+
use std::io::{self, BufRead, Write};
68
use std::path::PathBuf;
79
use structopt::StructOpt;
810

@@ -51,6 +53,74 @@ impl FeedCommand {
5153
Ok(())
5254
}
5355

56+
async fn select_remotes(state: &State, candidates: Vec<String>) -> Result<RemoteFeed> {
57+
if candidates.is_empty() {
58+
return Err(anyhow!(
59+
"Supplied URL is not a feed, and we can't find any potential candidate in the page."
60+
));
61+
}
62+
63+
let length = candidates.len();
64+
if length == 1 {
65+
let url = candidates.first().unwrap();
66+
log::info!(
67+
"Supplied URL is not a feed, but we found a potential candidate: {}",
68+
url
69+
);
70+
return Ok(RemoteFeed::new(url).await?);
71+
}
72+
73+
println!(
74+
"Supplied URL is not a feed, but we found {} potential candidates. Please select one:",
75+
length
76+
);
77+
78+
for (idx, url) in candidates.iter().enumerate() {
79+
println!("{}) {}", idx, url);
80+
}
81+
82+
let stdin = io::stdin();
83+
loop {
84+
print!("select (0-{}, c to cancel): ", length - 1);
85+
io::stdout().flush()?;
86+
87+
let mut selection = String::new();
88+
stdin.lock().read_line(&mut selection)?;
89+
90+
let selection = selection.trim();
91+
if selection == "c" {
92+
break Err(anyhow!("No selection was made."));
93+
}
94+
95+
match selection.parse::<usize>() {
96+
Ok(select) if select < length => {
97+
let url = candidates.get(select).unwrap();
98+
99+
let feed = {
100+
let conn = state.db.get()?;
101+
Feed::get_by_url(&conn, &url)?
102+
};
103+
104+
if feed.is_some() {
105+
println!("Error: Invalid selection: selected feed already exists");
106+
continue;
107+
}
108+
109+
match RemoteFeed::new(candidates.get(select).unwrap()).await {
110+
Ok(feed) => break Ok(feed),
111+
Err(e) => println!("Error: Selection is not a feed: {}", e),
112+
}
113+
}
114+
Ok(_) => {
115+
println!("Error: Invalid selection: out of range");
116+
}
117+
Err(e) => {
118+
println!("Error: Invalid selection: {}", e);
119+
}
120+
}
121+
}
122+
}
123+
54124
async fn add(state: State, url: String, group: Option<String>) -> Result<()> {
55125
let feed = {
56126
let conn = state.db.get()?;
@@ -61,7 +131,12 @@ impl FeedCommand {
61131
return Err(anyhow!("Feed `{}` already exists!", url));
62132
}
63133

64-
let remote = RemoteFeed::new(&url).await?;
134+
let remote = match RemoteFeed::try_new(&url).await? {
135+
Either::Left(remote) => remote,
136+
Either::Right(candidates) => Self::select_remotes(&state, candidates).await?,
137+
};
138+
139+
let url = remote.get_url().to_owned();
65140

66141
let feed = Feed::new(
67142
remote

src/find.rs

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
//! Finds feed URLs on a web page.
2+
use quick_xml::events::Event;
3+
use quick_xml::Reader;
4+
use std::io::BufRead;
5+
6+
use crate::error::Result;
7+
8+
/// Parses HTML page to find `<link rel="alternate" />` and extract hrefs.
9+
pub fn find_rel_alternates<B: BufRead>(reader: B) -> Result<Vec<String>> {
10+
let mut reader = Reader::from_reader(reader);
11+
reader.check_end_names(false);
12+
13+
let mut buf = Vec::new();
14+
let mut result = Vec::new();
15+
16+
loop {
17+
match reader.read_event(&mut buf) {
18+
Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) if e.name() == b"link" => {
19+
if e.attributes().fold(false, |acc, attr| {
20+
if acc {
21+
acc
22+
} else if let Ok(attr) = attr {
23+
attr.key == b"rel" && attr.value.as_ref() == b"alternate"
24+
} else {
25+
false
26+
}
27+
}) {
28+
if let Some(url) = e
29+
.attributes()
30+
.filter_map(|attr| {
31+
if let Ok(attr) = attr {
32+
if attr.key == b"href" {
33+
String::from_utf8(attr.value.into_owned()).ok()
34+
} else {
35+
None
36+
}
37+
} else {
38+
None
39+
}
40+
})
41+
.next()
42+
{
43+
result.push(url);
44+
}
45+
}
46+
}
47+
Ok(Event::Eof) => break,
48+
Err(e) => {
49+
return Err((e, reader.buffer_position()).into());
50+
}
51+
_ => (),
52+
}
53+
}
54+
55+
Ok(result)
56+
}

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ mod error;
66
mod api;
77
mod cli;
88
mod crawler;
9+
mod find;
910
pub mod model;
1011
mod opml;
1112
mod remote;

src/remote.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1+
use either::Either;
2+
13
use crate::error::Result;
4+
use crate::find::find_rel_alternates;
25

36
pub struct RemoteFeed {
47
url: String,
@@ -16,6 +19,18 @@ impl RemoteFeed {
1619
})
1720
}
1821

22+
/// Attempts to fetch and parse feed from the given url
23+
pub async fn try_new(url: &str) -> Result<Either<Self, Vec<String>>> {
24+
let bytes = surf::get(url).await?.body_bytes().await?;
25+
match feed_rs::parser::parse(&bytes[..]) {
26+
Ok(feed) => Ok(Either::Left(RemoteFeed {
27+
url: url.to_owned(),
28+
feed,
29+
})),
30+
Err(_) => Ok(Either::Right(find_rel_alternates(&bytes[..])?)),
31+
}
32+
}
33+
1934
pub fn get_title(&self) -> Option<String> {
2035
self.feed.title.as_ref().map(|t| t.content.clone())
2136
}
@@ -29,4 +44,8 @@ impl RemoteFeed {
2944
.next()
3045
.map(|x| x.to_owned())
3146
}
47+
48+
pub fn get_url(&self) -> &str {
49+
&self.url
50+
}
3251
}

0 commit comments

Comments
 (0)