sppd_cli/downloader/link_fetcher.rs

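//! Fetches the Hacienda open-data pages for minor contracts and public
//! tenders and scrapes the links to their downloadable ZIP files, keyed
//! by reporting period.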
use crate::errors::AppResult;
use regex::Regex;
use scraper::{Html, Selector};
use std::collections::BTreeMap;
use std::sync::OnceLock;
use tracing::info;
use url::Url;

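/// Hacienda open-data page listing the downloadable ZIP files for minor contracts ("contratos menores").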
9const MINOR_CONTRACTS_URL: &str = "https://www.hacienda.gob.es/es-es/gobiernoabierto/datos%20abiertos/paginas/contratosmenores.aspx";
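/// Hacienda open-data page listing the downloadable ZIP files for public tenders ("licitaciones").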
const PUBLIC_TENDERS_URL: &str = "https://www.hacienda.gob.es/es-ES/GobiernoAbierto/Datos%20Abiertos/Paginas/LicitacionesContratante.aspx";

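/// CSS selector matching anchor tags whose `href` ends in ".zip".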
const ZIP_LINK_SELECTOR: &str = r#"a[href$=".zip"]"#;
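/// Captures the digit group at the end of a ZIP filename, e.g. "202301" in "data_202301.zip".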
const PERIOD_REGEX_PATTERN: &str = r"_(\d+)\.zip$";

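/// Compiled `PERIOD_REGEX_PATTERN`, built once on first use.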
static PERIOD_REGEX: OnceLock<Regex> = OnceLock::new();

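/// Parsed `ZIP_LINK_SELECTOR`, built once on first use.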
static ZIP_LINK_SELECTOR_CACHED: OnceLock<Selector> = OnceLock::new();

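/// Fetches the ZIP links for both datasets and returns them as
/// `(minor_contracts, public_tenders)` maps keyed by period.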
pub async fn fetch_all_links() -> AppResult<(BTreeMap<String, String>, BTreeMap<String, String>)> {
    let client = reqwest::Client::new();
    info!("Fetching minor contracts links");
    let minor_links = fetch_zip(&client, MINOR_CONTRACTS_URL).await?;
    info!(
        periods_found = minor_links.len(),
        "Minor contracts links fetched"
    );

    info!("Fetching public tenders links");
    let public_links = fetch_zip(&client, PUBLIC_TENDERS_URL).await?;
    info!(
        periods_found = public_links.len(),
        "Public tenders links fetched"
    );

    Ok((minor_links, public_links))
}

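/// Downloads `input_url` and extracts the ".zip" links found on the page,
/// keyed by the period encoded in each filename.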
pub async fn fetch_zip(
    client: &reqwest::Client,
    input_url: &str,
) -> AppResult<BTreeMap<String, String>> {
    let base_url = Url::parse(input_url)?;

    let response = client
        .get(base_url.as_str())
        .send()
        .await?
        .error_for_status()?
        .text()
        .await?;

    parse_zip_links(&response, &base_url)
}

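/// Scans `html` for ".zip" anchors, resolves each `href` against `base_url`
/// and returns a map from captured period to absolute URL. When several
/// links share the same period, the last one encountered wins.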
pub fn parse_zip_links(html: &str, base_url: &Url) -> AppResult<BTreeMap<String, String>> {
    let document = Html::parse_document(html);

    let mut links: BTreeMap<String, String> = BTreeMap::new();

    let selector = ZIP_LINK_SELECTOR_CACHED.get_or_init(|| {
        Selector::parse(ZIP_LINK_SELECTOR).expect("ZIP_LINK_SELECTOR is a valid CSS selector")
    });

    let period_regex = PERIOD_REGEX.get_or_init(|| {
        Regex::new(PERIOD_REGEX_PATTERN).expect("PERIOD_REGEX_PATTERN is a valid regex pattern")
    });

    // Resolve every ".zip" href against the page URL so relative links become absolute.
    for url in document
        .select(selector)
        .filter_map(|el| el.value().attr("href"))
        .filter_map(|href| base_url.join(href).ok())
    {
        // Key each link by the digits captured from its filename, e.g. "202301".
        if let Some(filename) = url.path_segments().and_then(|mut s| s.next_back()) {
            if let Some(m) = period_regex.captures(filename).and_then(|c| c.get(1)) {
                links.insert(m.as_str().to_string(), url.to_string());
            }
        }
    }

    Ok(links)
}

#[cfg(test)]
mod tests {
    use super::parse_zip_links;
    use url::Url;

    #[test]
    fn test_parse_zip_links_basic() {
        let html = r#"
            <html>
            <body>
                <a href="files/data_202301.zip">202301</a>
                <a href="/downloads/data_202302.zip">202302</a>
                <a href="https://other.example.com/attachments/data_202303.zip">202303</a>
                <a href="not_a_zip.txt">skip</a>
            </body>
            </html>
        "#;

        let base = Url::parse("https://example.com/path/").expect("base url");
        let result = parse_zip_links(html, &base).expect("parse succeeds");

        assert_eq!(
            result.get("202301").unwrap(),
            "https://example.com/path/files/data_202301.zip"
        );
        assert_eq!(
            result.get("202302").unwrap(),
            "https://example.com/downloads/data_202302.zip"
        );
        assert_eq!(
            result.get("202303").unwrap(),
            "https://other.example.com/attachments/data_202303.zip"
        );
    }

    #[test]
    fn test_parse_zip_links_no_capture() {
        let html = r#"
            <html><body>
                <a href="files/data202301.zip">no underscore</a>
                <a href="files/data_abc.zip">non-numeric</a>
            </body></html>
        "#;

        let base = Url::parse("https://example.com/").expect("base url");
        let result = parse_zip_links(html, &base).expect("parse succeeds");
        assert!(result.is_empty());
    }

    #[test]
    fn test_parse_zip_links_multiple_underscores_uses_last_capture() {
        let html = r#"
            <html><body>
                <a href="files/prefix_2023_202301.zip">multi</a>
            </body></html>
        "#;

        let base = Url::parse("https://example.com/").expect("base url");
        let result = parse_zip_links(html, &base).expect("parse succeeds");
        assert_eq!(
            result.get("202301").unwrap(),
            "https://example.com/files/prefix_2023_202301.zip"
        );
    }

    #[test]
    fn test_parse_zip_links_duplicate_periods_last_wins() {
        let html = r#"
            <html><body>
                <a href="files/data_202301.zip">first</a>
                <a href="files/other_202301.zip">second</a>
            </body></html>
        "#;

        let base = Url::parse("https://example.com/").expect("base url");
        let result = parse_zip_links(html, &base).expect("parse succeeds");
        assert_eq!(
            result.get("202301").unwrap(),
            "https://example.com/files/other_202301.zip"
        );
    }

    #[test]
    fn test_parse_zip_links_relative_paths_resolve() {
        let html = r#"
            <html><body>
                <a href="./files/data_202304.zip">rel</a>
                <a href="../up/data_202305.zip">up</a>
            </body></html>
        "#;

        let base = Url::parse("https://example.com/path/sub/").expect("base url");
        let result = parse_zip_links(html, &base).expect("parse succeeds");
        assert_eq!(
            result.get("202304").unwrap(),
            "https://example.com/path/sub/files/data_202304.zip"
        );
        assert_eq!(
            result.get("202305").unwrap(),
            "https://example.com/path/up/data_202305.zip"
        );
    }
}