// sppd_cli/downloader/link_fetcher.rs

1use crate::errors::AppResult;
2use regex::Regex;
3use scraper::{Html, Selector};
4use std::collections::BTreeMap;
5use std::sync::OnceLock;
6use tracing::info;
7use url::Url;
8
9// Data source URLs
10const MINOR_CONTRACTS_URL: &str = "https://www.hacienda.gob.es/es-es/gobiernoabierto/datos%20abiertos/paginas/contratosmenores.aspx";
11const PUBLIC_TENDERS_URL: &str = "https://www.hacienda.gob.es/es-ES/GobiernoAbierto/Datos%20Abiertos/Paginas/LicitacionesContratante.aspx";
12
13// Selectors and Patterns
14const ZIP_LINK_SELECTOR: &str = r#"a[href$=".zip"]"#;
15const PERIOD_REGEX_PATTERN: &str = r"_(\d+)\.zip$";
16
17/// Cached regex for extracting period identifiers from ZIP filenames.
18/// Compiled once at initialization for performance.
19static PERIOD_REGEX: OnceLock<Regex> = OnceLock::new();
20
21/// Cached CSS selector for ZIP file links.
22/// Compiled once at initialization for performance.
23static ZIP_LINK_SELECTOR_CACHED: OnceLock<Selector> = OnceLock::new();
24
25/// Fetches all available ZIP file links from both procurement data sources.
26///
27/// This function sequentially fetches links from both the minor contracts and
28/// public tenders data source pages. It parses HTML to extract ZIP file links
29/// and extracts period identifiers (e.g., "202301") from filenames.
30///
31/// # Returns
32///
33/// Returns a tuple containing maps of period strings to download URLs:
34/// - **First element**: Minor contracts links (period -> URL)
35/// - **Second element**: Public tenders links (period -> URL)
36///
37/// # Errors
38///
39/// Returns an error if:
40/// - Network requests fail
41/// - HTML parsing fails
42/// - URLs cannot be parsed
43///
44pub async fn fetch_all_links() -> AppResult<(BTreeMap<String, String>, BTreeMap<String, String>)> {
45    let client = reqwest::Client::new();
46    // Sequential fetch: simple and reliable for two landing pages.
47    info!("Fetching minor contracts links");
48    let minor_links = fetch_zip(&client, MINOR_CONTRACTS_URL).await?;
49    info!(
50        periods_found = minor_links.len(),
51        "Minor contracts links fetched"
52    );
53
54    info!("Fetching public tenders links");
55    let public_links = fetch_zip(&client, PUBLIC_TENDERS_URL).await?;
56    info!(
57        periods_found = public_links.len(),
58        "Public tenders links fetched"
59    );
60
61    Ok((minor_links, public_links))
62}
63
64/// Fetches ZIP file links from a single procurement data page.
65///
66/// Downloads the HTML content from the given URL and parses it to extract
67/// all ZIP file download links. Period identifiers are extracted from filenames
68/// using a regex pattern that matches `_YYYYMM.zip` or similar formats.
69///
70/// # Arguments
71///
72/// * `client` - HTTP client to use for the request
73/// * `input_url` - URL of the page containing ZIP file links (e.g., the minor contracts
74///   or public tenders landing page)
75///
76/// # Returns
77///
78/// A map from period strings (e.g., "202301") to absolute download URLs.
79///
80/// # Errors
81///
82/// Returns an error if:
83/// - The HTTP request fails
84/// - The URL cannot be parsed
85/// - HTML parsing fails
86///
87pub async fn fetch_zip(
88    client: &reqwest::Client,
89    input_url: &str,
90) -> AppResult<BTreeMap<String, String>> {
91    // parse the base URL
92    let base_url = Url::parse(input_url)?;
93
94    // fetch the page content
95    let response = client
96        .get(base_url.as_str())
97        .send()
98        .await?
99        .error_for_status()?
100        .text()
101        .await?;
102
103    parse_zip_links(&response, &base_url)
104}
105
106/// Parses HTML content and extracts ZIP file links, extracting period identifiers from filenames.
107///
108/// This function searches for all `<a>` tags with `href` attributes ending in `.zip`,
109/// extracts period identifiers from filenames using a regex pattern (e.g., `_202301.zip`),
110/// and resolves relative URLs to absolute URLs using the base URL.
111///
112/// # Returns
113///
114/// Returns a map where keys are period strings (e.g., "202301") and values are absolute URLs.
115///
116pub fn parse_zip_links(html: &str, base_url: &Url) -> AppResult<BTreeMap<String, String>> {
117    let document = Html::parse_document(html);
118
119    let mut links: BTreeMap<String, String> = BTreeMap::new();
120
121    let selector = ZIP_LINK_SELECTOR_CACHED.get_or_init(|| {
122        Selector::parse(ZIP_LINK_SELECTOR).expect("ZIP_LINK_SELECTOR is a valid CSS selector")
123    });
124
125    let period_regex = PERIOD_REGEX.get_or_init(|| {
126        Regex::new(PERIOD_REGEX_PATTERN).expect("PERIOD_REGEX_PATTERN is a valid regex pattern")
127    });
128
129    for url in document
130        .select(selector)
131        .filter_map(|el| el.value().attr("href"))
132        .filter_map(|href| base_url.join(href).ok())
133    {
134        if let Some(filename) = url.path_segments().and_then(|mut s| s.next_back()) {
135            if let Some(m) = period_regex.captures(filename).and_then(|c| c.get(1)) {
136                links.insert(m.as_str().to_string(), url.to_string());
137            }
138        }
139    }
140
141    Ok(links)
142}
143
#[cfg(test)]
mod tests {
    use super::parse_zip_links;
    use url::Url;

    /// Parses `html` against `base` and returns the resulting period -> URL map.
    fn links_from(html: &str, base: &str) -> std::collections::BTreeMap<String, String> {
        let base_url = Url::parse(base).expect("base url");
        parse_zip_links(html, &base_url).expect("parse succeeds")
    }

    #[test]
    fn test_parse_zip_links_basic() {
        let html = r#"
            <html>
            <body>
              <a href="files/data_202301.zip">202301</a>
              <a href="/downloads/data_202302.zip">202302</a>
              <a href="https://other.example.com/attachments/data_202303.zip">202303</a>
              <a href="not_a_zip.txt">skip</a>
            </body>
            </html>
        "#;

        let links = links_from(html, "https://example.com/path/");

        // Relative, root-relative and absolute hrefs all end up as absolute URLs.
        assert_eq!(
            links.get("202301").map(String::as_str),
            Some("https://example.com/path/files/data_202301.zip")
        );
        assert_eq!(
            links.get("202302").map(String::as_str),
            Some("https://example.com/downloads/data_202302.zip")
        );
        assert_eq!(
            links.get("202303").map(String::as_str),
            Some("https://other.example.com/attachments/data_202303.zip")
        );
    }

    #[test]
    fn test_parse_zip_links_no_capture() {
        let html = r#"
            <html><body>
              <a href="files/data202301.zip">no underscore</a>
              <a href="files/data_abc.zip">non-numeric</a>
            </body></html>
        "#;

        let links = links_from(html, "https://example.com/");

        // Neither filename has `_<digits>.zip`, so nothing is collected.
        assert!(links.is_empty());
    }

    #[test]
    fn test_parse_zip_links_multiple_underscores_uses_last_capture() {
        let html = r#"
            <html><body>
              <a href="files/prefix_2023_202301.zip">multi</a>
            </body></html>
        "#;

        let links = links_from(html, "https://example.com/");

        // Only the digit run directly before `.zip` is used as the period.
        assert_eq!(
            links.get("202301").map(String::as_str),
            Some("https://example.com/files/prefix_2023_202301.zip")
        );
    }

    #[test]
    fn test_parse_zip_links_duplicate_periods_last_wins() {
        let html = r#"
            <html><body>
              <a href="files/data_202301.zip">first</a>
              <a href="files/other_202301.zip">second</a>
            </body></html>
        "#;

        let links = links_from(html, "https://example.com/");

        // The later link replaces the earlier one stored under the same period.
        assert_eq!(
            links.get("202301").map(String::as_str),
            Some("https://example.com/files/other_202301.zip")
        );
    }

    #[test]
    fn test_parse_zip_links_relative_paths_resolve() {
        let html = r#"
            <html><body>
              <a href="./files/data_202304.zip">rel</a>
              <a href="../up/data_202305.zip">up</a>
            </body></html>
        "#;

        let links = links_from(html, "https://example.com/path/sub/");

        assert_eq!(
            links.get("202304").map(String::as_str),
            Some("https://example.com/path/sub/files/data_202304.zip")
        );
        assert_eq!(
            links.get("202305").map(String::as_str),
            Some("https://example.com/path/up/data_202305.zip")
        );
    }
}