worker/src/parser/utils.rs
2025-09-12 20:34:01 +02:00

106 lines
2.9 KiB
Rust

use chrono::{Datelike, NaiveDate, NaiveDateTime, NaiveTime};
use sha2::{Digest, Sha256};
use std::collections::HashMap;
use std::error::Error;
/// Parses a French long-form date such as `"vendredi 6 juin 2025"`.
///
/// The input is split on whitespace and read positionally: the first token
/// (normally the weekday) is ignored, the second is the day of the month, the
/// third is the month name and the optional fourth is the year. When no fourth
/// token is present, the current local year is used.
///
/// Month names are matched case-insensitively, and the unaccented spellings
/// `fevrier`, `aout` and `decembre` are accepted in addition to the accented
/// ones.
///
/// # Errors
///
/// Returns an error when fewer than three tokens are present, when the day
/// token contains no parsable digits, when the month name is unknown, when the
/// year token is not an integer, or when the components do not form a valid
/// calendar date.
pub fn parse_french_date(french_date: &str) -> Result<NaiveDate, Box<dyn Error>> {
    let months: HashMap<&str, u32> = [
        ("janvier", 1),
        ("février", 2),
        ("fevrier", 2),
        ("mars", 3),
        ("avril", 4),
        ("mai", 5),
        ("juin", 6),
        ("juillet", 7),
        ("août", 8),
        ("aout", 8),
        ("septembre", 9),
        ("octobre", 10),
        ("novembre", 11),
        ("décembre", 12),
        ("decembre", 12),
    ]
    .iter()
    .copied()
    .collect();

    // Expected layout: ["vendredi", "6", "juin", "2025"]; the year is optional.
    let parts: Vec<&str> = french_date.split_whitespace().collect();
    if parts.len() < 3 {
        return Err("Date string too short".into());
    }

    // Keep only the digits so ordinal forms such as "1er" still yield a day.
    let day: u32 = parts[1]
        .chars()
        .filter(char::is_ascii_digit)
        .collect::<String>()
        .parse()?;

    // Normalise case before the lookup so "Juin" and "JUIN" match "juin".
    // The error message keeps the token exactly as it appeared in the input.
    let month_key = parts[2].to_lowercase();
    let month = months
        .get(month_key.as_str())
        .copied()
        .ok_or_else(|| format!("Unknown month: {}", parts[2]))?;

    // Fall back to the current local year when the input does not carry one.
    let year: i32 = match parts.get(3) {
        Some(token) => token.parse()?,
        None => chrono::Local::now().year(),
    };

    let date = NaiveDate::from_ymd_opt(year, month, day).ok_or("Invalid date components")?;
    Ok(date)
}
/// Parses a French long-form date and time such as
/// `"vendredi 6 juin 2025 à 19 h 00"`.
///
/// The input is split on whitespace and must contain at least eight tokens.
/// The first four are handed to [`parse_french_date`]; the sixth and eighth
/// are read as the hour and the minute. The fifth and seventh tokens (`"à"`
/// and `"h"` in the example) are skipped without being checked. Seconds are
/// always set to zero.
///
/// # Errors
///
/// Returns an error when fewer than eight tokens are present, when the date
/// part is rejected by [`parse_french_date`], when the hour or minute token
/// is not an unsigned integer, or when they do not form a valid time of day.
pub fn parse_french_datetime(french_date: &str) -> Result<NaiveDateTime, Box<dyn Error>> {
    let tokens: Vec<&str> = french_date.split_whitespace().collect();
    if tokens.len() < 8 {
        return Err("DateTime string too short".into());
    }

    // tokens: ["vendredi", "6", "juin", "2025" | "à", "19", "h", "00", ...]
    let (date_tokens, time_tokens) = tokens.split_at(4);
    let date = parse_french_date(&date_tokens.join(" "))?;

    // Within `time_tokens`, index 1 is the hour and index 3 the minute.
    let hour: u32 = time_tokens[1].parse()?;
    let minute: u32 = time_tokens[3].parse()?;

    Ok(date
        .and_hms_opt(hour, minute, 0)
        .ok_or("Invalid time components")?)
}
/// Strips paragraph markup from `content` and trims the result.
///
/// Every literal `<p>` and `</p>` is removed, every `&nbsp;` is replaced by a
/// plain space, and leading/trailing whitespace is trimmed. The replacements
/// run one after another in that order. No other tags or entities are touched.
pub fn clean_content(content: &str) -> String {
    // (needle, replacement) pairs, applied sequentially in this order.
    const SUBSTITUTIONS: [(&str, &str); 3] = [("<p>", ""), ("</p>", ""), ("&nbsp;", " ")];

    let replaced = SUBSTITUTIONS
        .iter()
        .fold(content.to_owned(), |text, &(needle, replacement)| {
            text.replace(needle, replacement)
        });

    replaced.trim().to_string()
}
/// Combines `date` with a French-style time of day such as `"19h00"`.
///
/// Surrounding whitespace is trimmed and every `h` or `H` is turned into `:`,
/// so `"19h00"`, `"19H00"` and `"19:00"` are handled the same way. A bare hour
/// such as `"19h"` is read as `19:00`. The normalised text is then parsed with
/// the `%H:%M` format.
///
/// # Errors
///
/// Returns `Err` with the message `Invalid time format: <time_str>` when the
/// normalised text cannot be parsed as `%H:%M`.
pub fn with_time(date: NaiveDate, time_str: &str) -> Result<NaiveDateTime, String> {
    // Replace the French hour marker with ':' so the `%H:%M` format applies.
    let mut normalized = time_str
        .trim()
        .replace(|c: char| c == 'h' || c == 'H', ":");

    // "19h" carries no minutes; treat it as the top of the hour.
    if normalized.ends_with(':') {
        normalized.push_str("00");
    }

    NaiveTime::parse_from_str(&normalized, "%H:%M")
        .map(|time| date.and_time(time))
        .map_err(|_| format!("Invalid time format: {}", time_str))
}
/// Returns the SHA-256 digest of `text`'s UTF-8 bytes as a lowercase
/// hexadecimal string.
pub fn hash_text(text: &str) -> String {
    // One-shot hashing: equivalent to new() + update() + finalize().
    let digest = Sha256::digest(text.as_bytes());
    format!("{:x}", digest)
}