commit 36e10cf8d29a86e10dae711dddef60b1ea668059
Author: Nathan Lamy
Date:   Thu Aug 21 13:49:30 2025 +0200

    Initial commit :)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..007e632
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+/target
+/prod
+config.toml
+Cargo.lock
\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..92d1e2f
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "worker"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
+reqwest = { version = "0.12", features = ["blocking", "json"] }
+scraper = "0.18"
+tokio = { version = "1", features = ["full"] }
+chrono = { version = "0.4", features = ["serde", "clock"] }
+url = "2.5.4"
+redis = { version = "0.32.3", features = ["tokio-comp"] }
+pdf-extract = "0.9.0"
+tempfile = "3.20.0"
+config = "0.15.13"
+tokio-cron-scheduler = "0.14.0"
+sha2 = "0.10.9"
+cron = "0.15.0"
diff --git a/src/api.rs b/src/api.rs
new file mode 100644
index 0000000..0c089e2
--- /dev/null
+++ b/src/api.rs
@@ -0,0 +1,92 @@
+use chrono::NaiveDateTime;
+use reqwest::{self, Client};
+use serde_json::json;
+
+use crate::configuration::{Settings, get_config};
+
+pub async fn post_colle(
+    colle: &Colle,
+    class_name: &str,
+    config: &Settings,
+) -> Result<(), Box<dyn std::error::Error>> {
+    let api_url = get_config(config, "api");
+    let api_token = get_config(config, "token");
+
+    let url = format!("{api_url}/colles");
+    let colle_json = json!({
+        "colle": &colle,
+        "className": class_name,
+    });
+
+    let response = Client::new()
+        .post(&url)
+        .json(&colle_json)
+        .header("Content-Type", "application/json")
+        .header("Accept", "application/json")
+        // Bearer token for authentication
+        .header("Authorization", format!("Bearer {api_token}"))
+        .send()
+        .await?;
+    // Check if the response is successful
+    if !response.status().is_success() {
+        eprintln!("Failed to post colle: HTTP {}", response.status());
+        eprint!("Response: {:?}", response.text().await?);
+        return Err("Failed to post colle".into());
+    }
+
+    Ok(())
+}
+
+pub async fn post_upcoming_colles(
+    colles: &[Colle],
+    class_name: &str,
+    config: &Settings,
+) -> Result<(), Box<dyn std::error::Error>> {
+    let api_url = get_config(config, "api");
+    let api_token = get_config(config, "token");
+
+    let url = format!("{api_url}/colles");
+    let colles_json = json!({
+        "colles": &colles,
+        "className": class_name,
+    });
+
+    let response = Client::new()
+        .post(&url)
+        .json(&colles_json)
+        .header("Content-Type", "application/json")
+        .header("Accept", "application/json")
+        // Bearer token for authentication
+        .header("Authorization", format!("Bearer {api_token}"))
+        .send()
+        .await?;
+    // Check if the response is successful
+    if !response.status().is_success() {
+        eprintln!("Failed to post upcoming colles: HTTP {}", response.status());
+        eprint!("Response: {:?}", response.text().await?);
+        return Err("Failed to post upcoming colles".into());
+    }
+
+    Ok(())
+}
+
+#[derive(Debug, serde::Serialize)]
+pub struct Colle {
+    pub date: NaiveDateTime,
+    pub examiner: String,
+    pub room: String,
+    pub subject: String,
+    pub student: String,
+    pub bjid: Option<String>,
+    pub bjsecret: Option<String>,
+    pub grade: Option<f64>,
+    pub content: Option<String>,
+    pub comment: Option<String>,
+    pub attachments: Option<Vec<ColleAttachment>>, // (file_url, file_name)
+}
+
+#[derive(Debug, serde::Serialize)]
+pub struct ColleAttachment {
+    pub url: String,
+    pub name: String,
+}
diff --git a/src/configuration.rs b/src/configuration.rs
new file mode 100644
index 0000000..4f37f65
--- /dev/null
+++ b/src/configuration.rs
@@ -0,0 +1,77 @@
+use config::{Config, ConfigError, File};
+use serde::Deserialize;
+use std::collections::HashMap;
+
+#[derive(Debug, Deserialize)]
+pub struct Account {
+    pub username: String,
+    pub password: String,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct Settings {
+    redis: Option<HashMap<String, String>>,
+    api: Option<HashMap<String, String>>,
+    accounts: Option<HashMap<String, Account>>,
+    cron: Option<HashMap<String, Vec<String>>>,
+}
+
+pub fn load_config() -> Result<Settings, ConfigError> {
+    let settings = Config::builder()
+        .add_source(File::with_name("config"))
+        .build()?
+        .try_deserialize::<Settings>()?;
+
+    Ok(settings)
+}
+
+pub fn get_config(config: &Settings, section: &str) -> String {
+    match section {
+        "redis" => {
+            if let Some(redis_map) = &config.redis {
+                redis_map.get("url").cloned().unwrap_or_default()
+            } else {
+                println!("Redis configuration not found.");
+                String::new()
+            }
+        }
+        "api" => {
+            if let Some(api_map) = &config.api {
+                api_map.get("url").cloned().unwrap_or_default()
+            } else {
+                String::new()
+            }
+        }
+        "token" => {
+            if let Some(api_map) = &config.api {
+                api_map.get("token").cloned().unwrap_or_default()
+            } else {
+                String::new()
+            }
+        }
+        _ => String::new(),
+    }
+}
+
+pub fn get_cron(config: &Settings, section: &str) -> Vec<String> {
+    if let Some(map) = &config.cron {
+        map.get(section).cloned().unwrap_or_default()
+    } else {
+        Vec::new()
+    }
+}
+
+pub fn get_account_by_class<'a>(config: &'a Settings, class_name: &str) -> Option<&'a Account> {
+    config
+        .accounts
+        .as_ref()
+        .and_then(|accounts| accounts.get(class_name))
+}
+
+pub fn list_classes(config: &Settings) -> Vec<String> {
+    config
+        .accounts
+        .as_ref()
+        .map(|accounts| accounts.keys().cloned().collect())
+        .unwrap_or_default()
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..eec1e99
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,303 @@
+use std::{str::FromStr, sync::Arc};
+
+use crate::{
+    api::{post_colle, post_upcoming_colles},
+    configuration::{get_config, get_cron, list_classes, load_config},
+    parser::{authenticate, fetch_class_colles, fetch_colle, fetch_upcoming_colles},
+};
+use chrono::{DateTime, Utc};
+use cron::Schedule;
+use redis::AsyncCommands;
+use redis::aio::MultiplexedConnection;
+use redis::{Client, Connection, RedisError, RedisResult, TypedCommands};
+use serde::Serialize;
+use serde_json::Value;
+use tokio::sync::Mutex;
+use tokio_cron_scheduler::{Job, JobScheduler};
+mod api;
+mod configuration;
+mod parser;
+
+#[tokio::main]
+async fn main() -> redis::RedisResult<()> {
+    // Load configuration
+    let config = load_config();
+    if config.is_err() {
+        eprintln!("Failed to load configuration: {}", config.unwrap_err());
+        return Err(redis::RedisError::from((
+            redis::ErrorKind::InvalidClientConfig,
+            "Configuration loading failed",
+        )));
+    }
+    let config = config.unwrap();
+    println!("Configuration loaded successfully");
+
+    // Connect to Redis
+    let redis_url = get_config(&config, "redis");
+    let client = Client::open(redis_url)?;
+
+    // Create separate connections for pubsub and job processing
+    let mut pubsub_con = client.get_connection()?;
+    let mut job_con = client.get_connection()?;
+    let scheduler_con = client.get_multiplexed_async_connection().await.unwrap();
+    let mut pubsub = pubsub_con.as_pubsub();
+    pubsub.subscribe("jobs_queue")?;
+    println!("Connected to Redis pubsub channel 'jobs_queue'");
+
+    // Schedule cron jobs
+    schedule_cron_jobs(&config, scheduler_con).await;
+    println!("Cron jobs scheduled successfully");
+
+    loop {
+        let msg = pubsub.get_message()?;
+        let payload: String = msg.get_payload()?;
+        if let Ok(event) = serde_json::from_str::<Value>(&payload) {
+            match process_job(&event, &mut job_con, &config).await {
+                Ok(_) => {
+                    println!("Job processed successfully: {:?}", event);
+                }
+                Err(e) => {
+                    eprintln!("Error processing job: {:?}, Error: {}", event, e);
+                }
+            }
+        }
+    }
+}
+
+#[derive(Serialize)]
+struct JobMessage {
+    r#type: u8,
+    class_name: String,
+}
+
+async fn schedule_cron_jobs(config: &configuration::Settings, con: MultiplexedConnection) {
+    let scheduler = JobScheduler::new().await.unwrap();
+    let classes: Vec<String> = list_classes(config);
+    let class_colles_crons = get_cron(config, "class_colles");
+    let upcoming_colles_crons = get_cron(config, "upcoming_colles");
+
+    let con = Arc::new(Mutex::new(con));
+
+    for class_name in classes {
+        // Start job for upcoming colles
+        let message = JobMessage {
+            r#type: 2,
+            class_name: class_name.clone(),
+        };
+        for cron in &upcoming_colles_crons {
+            let job = start_job(&message, &cron, Arc::clone(&con)).await;
+            scheduler.add(job).await.unwrap();
+        }
+        println!("Scheduled job for upcoming colles: {}", class_name);
+        // Start job for class colles
+        let message = JobMessage {
+            r#type: 1,
+            class_name: class_name.clone(),
+        };
+        for cron in &class_colles_crons {
+            let job = start_job(&message, &cron, Arc::clone(&con)).await;
+            scheduler.add(job).await.unwrap();
+        }
+        println!("Scheduled job for class colles: {}", class_name);
+    }
+
+    scheduler.start().await.unwrap();
+}
+
+async fn start_job(
+    message: &JobMessage,
+    cron: &str,
+    con: Arc<Mutex<MultiplexedConnection>>,
+) -> Job {
+    let con_clone = Arc::clone(&con);
+    let payload = serde_json::to_string(message).unwrap();
+
+    // Register cron job
+    Job::new_async(&cron, move |_uuid, _l| {
+        let payload = payload.clone();
+        let con_clone = Arc::clone(&con_clone);
+        Box::pin(async move {
+            let mut conn = con_clone.lock().await;
+            let res: Result<(), RedisError> = conn.publish("jobs_queue", payload).await;
+            if res.is_err() {
+                eprintln!("Failed to publish job: {}", res.unwrap_err());
+            }
+        })
+    })
+    .unwrap()
+}
+
+async fn process_job(
+    job: &Value,
+    con: &mut Connection,
+    config: &configuration::Settings,
+) -> RedisResult<()> {
+    // Retrieve the class name from the job
+    println!("Processing job: {:?}", job);
+    let class_name = job["class_name"].as_str();
+    if class_name.is_none() {
+        eprintln!("Job does not contain a class name.");
+        return Err(redis::RedisError::from((
+            redis::ErrorKind::InvalidClientConfig,
+            "Job missing class name",
+        )));
+    }
+    let class_name = class_name.unwrap();
+
+    /*
+     * Fetch ONE colle (id and secret)
+     */
+    if job["type"] == 0 {
+        // Simulate fetching colles
+        let colle_id = job["colle_id"].as_str().unwrap_or("unknown");
+        let colle_secret = job["colle_secret"].as_str().unwrap_or("unknown");
+        println!("Fetched colle: ID={}, Secret={}", colle_id, colle_secret);
+
+        match fetch_colle(colle_id, colle_secret).await {
+            Ok(colle) => {
+                if colle.is_none() {
+                    eprintln!("No colle found for ID: {}", colle_id);
+                    return Ok(());
+                }
+                let mut colle = colle.unwrap();
+                if colle.examiner.is_empty() {
+                    colle.examiner = job["examiner_name"]
+                        .as_str()
+                        .unwrap_or("Unknown")
+                        .to_string();
+                }
+                let res = post_colle(&colle, class_name, config).await;
+                if res.is_err() {
+                    eprintln!("Failed to post colle: {}", res.unwrap_err());
+                    return Err(redis::RedisError::from((
+                        redis::ErrorKind::ResponseError,
+                        "Failed to post colle",
+                    )));
+                }
+            }
+            Err(e) => {
+                eprintln!("Failed to fetch colle {}: {}", colle_id, e);
+            }
+        }
+    } else {
+        // Authenticate
+        let session = authenticate(class_name, &mut *con, config).await;
+        if session.is_err() {
+            eprintln!("Failed to authenticate: {}", session.unwrap_err());
+            return Err(redis::RedisError::from((
+                redis::ErrorKind::AuthenticationFailed,
+                "Authentication failed",
+            )));
+        }
+        let session = session.unwrap();
+        println!("Authenticated successfully: {}", session);
+
+        /*
+         * Fetch class colles (class name and optionally date)
+         */
+        if job["type"] == 1 {
+            let last_hash_key = format!("class_colles_hash_{}", class_name);
+            let last_hash = get_last_hash(con, &last_hash_key);
+
+            // Fetch class colles
+            match fetch_class_colles(&session, job["date"].as_str().or(None), &last_hash).await {
+                Ok((colles, hash)) => {
+                    set_last_hash(con, &last_hash_key, &hash)?;
+                    // Add each colle to Redis queue pubsub
+                    for (colle_id, colle_secret, examiner_name) in colles {
+                        let colle_job = serde_json::json!({
+                            "type": 0,
+                            "colle_id": colle_id,
+                            "colle_secret": colle_secret,
+                            "class_name": class_name,
+                            "examiner_name": examiner_name,
+                        });
+                        let colle_job_str =
+                            serde_json::to_string(&colle_job).unwrap_or_else(|_| "{}".to_string());
+
+                        // Publish the colle job to the Redis channel
+                        con.publish("jobs_queue", &colle_job_str)?;
+                        println!("Published colle job: {}", &colle_job_str);
+                    }
+
+                    // Set the last sync date in Redis
+                    let last_sync_key = format!("last_sync_{}", class_name);
+                    con.set(&last_sync_key, Utc::now().to_rfc3339().as_str())?;
+
+                    let class_colles_crons = get_cron(config, "class_colles");
+                    let healthy_until = next_run_any(&class_colles_crons).unwrap_or_default();
+                    let healhy_until_key = format!("healthy_until_{}", class_name);
+                    con.set(&healhy_until_key, healthy_until.to_rfc3339().as_str())?;
+                }
+                Err(e) => {
+                    eprintln!("Failed to fetch class colles: {}", e);
+                }
+            }
+        /*
+         * Fetch upcoming colles (class name)
+         */
+        } else if job["type"] == 2 {
+            // Retrieve the last hash from Redis and fetch upcoming colles
+            let last_hash_key = format!("upcoming_colles_hash_{}", class_name);
+            let last_hash = get_last_hash(con, &last_hash_key);
+
+            let (colles, hash) = fetch_upcoming_colles(&session, &last_hash).await;
+            set_last_hash(con, &last_hash_key, &hash)?;
+
+            if colles.is_empty() {
+                return Ok(());
+            }
+            let res = post_upcoming_colles(&colles, class_name, config).await;
+            if res.is_err() {
+                eprintln!("Failed to post upcoming colles: {}", res.unwrap_err());
+                return Err(redis::RedisError::from((
+                    redis::ErrorKind::ResponseError,
+                    "Failed to post upcoming colles",
+                )));
+            }
+            println!("Posted upcoming colles successfully.");
+        /*
+         * Handle unknown job types
+         * This is a catch-all for any job types that are not recognized.
+         */
+        } else {
+            eprintln!("Unknown job type: {}", job["type"]);
+        }
+    }
+
+    Ok(())
+}
+
+fn get_last_hash(con: &mut Connection, key: &str) -> String {
+    let last_hash = con.get(key);
+    // If the last hash is not found, use an empty string
+    let last_hash = match last_hash {
+        Ok(hash) => hash.unwrap_or_else(|| String::new()),
+        Err(e) => {
+            eprintln!("Failed to get last hash: {}", e);
+            String::new()
+        }
+    };
+    last_hash
+}
+
+fn set_last_hash(con: &mut Connection, key: &str, hash: &str) -> RedisResult<()> {
+    con.set(key, hash)?;
+    Ok(())
+}
+
+fn next_run_any<I>(expressions: I) -> Option<DateTime<Utc>>
+where
+    I: IntoIterator,
+    I::Item: AsRef<str>,
+{
+    let now = Utc::now();
+
+    expressions
+        .into_iter()
+        .filter_map(|expr| {
+            let schedule = Schedule::from_str(expr.as_ref()).ok()?;
+            schedule.after(&now).next()
+        })
+        .min()
+}
diff --git a/src/parser/auth.rs b/src/parser/auth.rs
new file mode 100644
index 0000000..43ee4de
--- /dev/null
+++ b/src/parser/auth.rs
@@ -0,0 +1,169 @@
+/**
+ * Authenticate on BJColle.
+ */
+use crate::{configuration::{get_account_by_class, Settings}};
+use redis::{Commands, Connection, RedisError, RedisResult};
+use reqwest::{self, header::HeaderValue, redirect::Policy};
+
+pub async fn request_session() -> Result<String, Box<dyn std::error::Error>> {
+    let url = "https://bjcolle.fr/acces.php";
+
+    let response = reqwest::get(url).await?;
+    // Get response headers
+    let session_id = response.headers().get("set-cookie");
+    if !session_id.is_some() {
+        return Err("Failed to get session ID".into());
+    }
+    Ok(session_id.unwrap().to_str()?.to_string())
+}
+
+pub async fn login(
+    username: &str,
+    password: &str,
+    session_id: &str,
+) -> Result<String, Box<dyn std::error::Error>> {
+    let url = "https://bjcolle.fr/acces.php";
+
+    let client = reqwest::Client::builder()
+        .redirect(Policy::none())
+        .build()?;
+    let response = client
+        .post(url)
+        .header("Cookie", session_id)
+        .form(&[
+            ("USERNAME_ACCES", username),
+            ("PASSWORD_ACCES", password),
+            ("SOUVENIR", "on"),
+            ("valider_ident", "Valider"),
+        ])
+        .send()
+        .await?;
+
+    // Get session ID from cookies
+    let session_id: Vec<String> = response
+        .headers()
+        .get_all("set-cookie")
+        .iter()
+        .filter_map(|value: &HeaderValue| value.to_str().ok())
+        .filter_map(|s| {
+            // Get the part before the first semicolon
+            let first_part = s.split(';').next()?.trim();
+
+            // Only keep "bjid" or "bjp"
+            if first_part.starts_with("bjid=") || first_part.starts_with("bjp=") {
+                Some(first_part.to_string())
+            } else {
+                None
+            }
+        })
+        .collect();
+
+    if session_id.is_empty() {
+        return Err("Failed to get session ID".into());
+    }
+
+    Ok(session_id.join("; "))
+}
+
+pub async fn refresh_session(cookie: &str) -> Result<String, Box<dyn std::error::Error>> {
+    let url = "https://bjcolle.fr/acces_cookies.php";
+
+    let client = reqwest::Client::builder()
+        .redirect(Policy::none())
+        .build()?;
+    let response = client.get(url).header("Cookie", cookie).send().await?;
+
+    // Get session ID from cookies
+    let new_session_id = response.headers().get("set-cookie");
+    if !new_session_id.is_some() {
+        return Err("Failed to refresh session ID".into());
+    }
+
+    Ok(new_session_id.unwrap().to_str()?.to_string())
+}
+
+pub async fn authenticate(
+    class_name: &str,
+    con: &mut Connection,
+    config: &Settings,
+) -> RedisResult<String> {
+    // Try reusing an existing session if available
+    let session_key = format!("session_{}", class_name);
+    let session_duration = 900; // 15 minutes
+    let session_id: RedisResult<Option<String>> = con.get(&session_key);
+    match session_id? {
+        Some(id) => {
+            println!("Reusing existing session: {}", id);
+            return Ok(id);
+        }
+        None => {
+            println!("No existing session found for class: {}", class_name);
+        }
+    }
+
+    // Request a session ID (valid for 15 minutes)
+    let session = request_session().await;
+    if let Err(err) = session {
+        eprintln!("Failed to request session: {}", err);
+        return Err(redis::RedisError::from((
+            redis::ErrorKind::AuthenticationFailed,
+            "Session request failed",
+        )));
+    }
+    let session_id = session.unwrap();
+    println!("Session requested: {}", session_id);
+
+    // Try to get cached session ID
+    let cookie_key = format!("auth_cookie_{}", class_name);
+    let cached_session: RedisResult<Option<String>> = con.get(&cookie_key);
+
+    match cached_session? {
+        Some(cookie) => {
+            // Attempt to refresh session
+            let refreshed = refresh_session(&cookie).await;
+            if let Err(err) = refreshed {
+                eprintln!("Failed to refresh session: {}", err);
+                // Remove the cached cookie if refresh fails and retry authentication
+                let _: Result<(), RedisError> = con.del(&cookie_key);
+                return Box::pin(authenticate(class_name, con, config)).await;
+            }
+
+            let new_session_id = refreshed.unwrap();
+            println!("Session refreshed: {}", new_session_id);
+
+            // Set the new session id (for 15 minutes)
+            let _: Result<(), RedisError> =
+                con.set_ex(&session_key, &new_session_id, session_duration);
+            Ok(new_session_id)
+        }
+
+        None => {
+            // No cookie found, authenticate with credentials
+            let account = get_account_by_class(&config, class_name);
+            if account.is_none() {
+                eprintln!("No account found for class: {}", class_name);
+                return Err(redis::RedisError::from((
+                    redis::ErrorKind::AuthenticationFailed,
+                    "No account found",
+                )));
+            }
+            let account = account.unwrap();
+            println!("Using account: {}", account.username);
+
+            let login_result = login(&account.username, &account.password, &session_id).await;
+            if let Err(err) = login_result {
+                eprintln!("Failed to login: {}", err);
+                return Err(redis::RedisError::from((
+                    redis::ErrorKind::AuthenticationFailed,
+                    "Login failed",
+                )));
+            }
+            let cookie = login_result.unwrap();
+            println!("Login successful: {}", cookie);
+
+            let _: Result<(), RedisError> = con.set(&cookie_key, cookie.clone());
+            // Set the new session id (for 15 minutes)
+            let _: Result<(), RedisError> =
+                con.set_ex(&session_key, &session_id, session_duration);
+            Ok(session_id)
+        }
+    }
+}
diff --git a/src/parser/colles/class.rs b/src/parser/colles/class.rs
new file mode 100644
index 0000000..3a9b965
--- /dev/null
+++ b/src/parser/colles/class.rs
@@ -0,0 +1,75 @@
+/**
+ * List class colles from BJColle (list ID and secret).
+ * URL: https://bjcolle.fr/students_dashboard_class.php
+ */
+use reqwest::{self, Client};
+use scraper::{Html, Selector};
+use std::collections::HashMap;
+use url::Url;
+
+use crate::parser::utils::hash_text;
+
+pub async fn fetch(
+    session: &str,
+    date: Option<&str>,
+    last_hash: &str,
+) -> Result<(Vec<(String, String, String)>, String), Box<dyn std::error::Error>> {
+    let mut url = String::from("https://bjcolle.fr/students_dashboard_class.php?erasedate=1");
+    if let Some(date) = date {
+        url = format!(
+            "https://bjcolle.fr/students_dashboard_class.php?go={}",
+            date
+        );
+    }
+
+    // Retrieve page content
+    let response = Client::new()
+        .get(url)
+        .header("Cookie", session)
+        .send()
+        .await?
+        .text()
+        .await?;
+
+    // Calculate the hash of the response
+    let hash = hash_text(&response);
+    if hash == last_hash {
+        return Ok((Vec::new(), hash));
+    }
+
+    let document = Html::parse_document(&response);
+
+    // Select the table rows containing colles
+    let row_selector = Selector::parse("#Choix > a.bouton_eleve2").unwrap();
+
+    let mut colles = Vec::new();
+
+    for row in document.select(&row_selector) {
+        // Extract ID and secret from each row
+        if let Some(href) = row.value().attr("href") {
+            if let Some((colle_id, colle_secret)) = parse_colle_info(href) {
+                let rows: Vec<&str> = row.text().collect();
+                let examiner_name = rows[2].trim().to_string();
+                colles.push((colle_id, colle_secret, examiner_name));
+            } else {
+                eprintln!("Failed to parse colle info from URL: {}", href);
+            }
+        } else {
+            eprintln!("Row does not contain a valid href attribute.");
+        }
+    }
+
+    Ok((colles, hash))
+}
+
+fn parse_colle_info(raw_url: &str) -> Option<(String, String)> {
+    let full_url = format!("https://dummy.host/{}", raw_url); // must be absolute
+    let parsed = Url::parse(&full_url).ok()?;
+
+    let query: HashMap<_, _> = parsed.query_pairs().into_owned().collect();
+
+    let colle_id = query.get("colle")?.to_string();
+    let colle_secret = query.get("hgfebrgl8ri3h")?.to_string();
+
+    Some((colle_id, colle_secret))
+}
diff --git a/src/parser/colles/fetch.rs b/src/parser/colles/fetch.rs
new file mode 100644
index 0000000..ce4f55e
--- /dev/null
+++ b/src/parser/colles/fetch.rs
@@ -0,0 +1,163 @@
+/*
+   Request a colle from BJColle given its ID (and secret).
+   URL : https://bjcolle.fr/students_oral_disp.php?colle=<id>&hgfebrgl8ri3h=<secret>
+*/
+
+use crate::{api::{Colle, ColleAttachment}, parser::utils};
+use reqwest;
+use scraper::{ElementRef, Html, Selector};
+
+pub async fn fetch(
+    colle_id: &str,
+    secret: &str,
+) -> Result<Option<Colle>, Box<dyn std::error::Error>> {
+    // Retrieve page content
+    let url = format!(
+        "https://bjcolle.fr/students_oral_disp.php?colle={}&hgfebrgl8ri3h={}",
+        colle_id, secret
+    );
+    let response = reqwest::get(url).await?.text().await?;
+
+    let document = Html::parse_document(&response);
+
+    let header_selector = "#envelope > header > table > tbody h2";
+    let header_selector = Selector::parse(header_selector).unwrap();
+
+    // Expect ONLY ONE header
+    let Some(header) = document.select(&header_selector).next() else {
+        // Header not found error
+        eprint!("Header not found in the document.");
+        return Ok(None);
+    };
+
+    // Use the header text to extract informations :
+    // - subject
+    // - examiner
+    // - date
+    // - room
+    // - student name
+    let header_text = header.text().collect::<Vec<_>>();
+    let (title, date, room, student) = if header_text.len() >= 4 {
+        (
+            header_text[0].trim().to_string(),
+            header_text[1].trim().to_string(),
+            header_text[2].trim().to_string(),
+            header_text[3].trim().to_string(),
+        )
+    } else {
+        // TODO: Handle the case where no header is found
+        print!("Header text does not contain enough elements.");
+        return Ok(None);
+    };
+
+    let (subject, examiner) = title
+        .split("avec")
+        .map(|s| s.trim())
+        .collect::<Vec<_>>()
+        .split_first()
+        .map(|(subject, examiner)| {
+            (
+                subject
+                    .to_string()
+                    .replace("Colle de", "")
+                    .replace("Colle d'", "")
+                    .trim()
+                    .to_string(),
+                examiner.join(" "),
+            )
+        })
+        .unwrap_or((String::new(), String::new()));
+
+    // Parse the date and room
+    let date = utils::parse_french_datetime(&date)?;
+    let room = room.replace("Salle :", "").trim().to_string();
+
+    // Parse grades and comments
+    let grade_selector = "#haut > div > input#NOTE_ELEVE";
+    let grade_selector = Selector::parse(grade_selector).unwrap();
+
+    let grade = document
+        .select(&grade_selector)
+        .next()
+        .and_then(|e| e.value().attr("value"))
+        .unwrap_or("")
+        .to_string()
+        .replace(",", ".")
+        .parse()
+        .unwrap_or(-1.0);
+
+    // Attachments
+    let attachment_selector = Selector::parse(".bj > a.bouton_eleve").unwrap();
+    let files = document
+        .select(&attachment_selector)
+        .filter_map(|el| {
+            let href = el.value().attr("href")?.to_string();
+            let text = el.text().collect::<String>().trim().to_string();
+            Some(ColleAttachment {
+                url: href,
+                name: text,
+            })
+        })
+        .collect();
+
+    // Return a JSON object with the parsed information
+    Ok(Some(Colle {
+        date,
+        room,
+        student,
+        subject,
+        examiner,
+        bjid: Some(colle_id.to_string()),
+        bjsecret: Some(secret.to_string()),
+        grade: if grade >= 0.0 { Some(grade) } else { None },
+        content: Some(extract_section(&document, "Sujet", true)),
+        comment: Some(extract_section(&document, "Commentaire", false)),
+        attachments: Some(files),
+    }))
+}
+
+fn extract_section(document: &Html, keyword: &str, use_first_span: bool) -> String {
+    let legend_selector = Selector::parse("fieldset.bj > legend.bj2").unwrap();
+    let span_selector = Selector::parse("fieldset.bj > span").unwrap();
+
+    // Check if legend contains the keyword ("Sujet" or "Commentaire")
+    let is_match = document
+        .select(&legend_selector)
+        .any(|el| el.text().collect::<String>().contains(keyword));
+
+    if !is_match {
+        return String::new();
+    }
+
+    // Collect all matching spans
+    let spans: Vec<_> = document.select(&span_selector).collect();
+
+    let span_el = if spans.is_empty() {
+        None
+    } else if use_first_span {
+        Some(spans[0].clone())
+    } else {
+        Some(spans[spans.len() - 1].clone())
+    };
+
+    let mut results = Vec::new();
+
+    if let Some(span) = span_el {
+        // Filter children != <br>
+        let children: Vec<_> = span
+            .children()
+            .filter_map(ElementRef::wrap)
+            .filter(|el| el.value().name() != "br")
+            .collect();
+
+        if !children.is_empty() {
+            for child in children {
+                results.push(child.html());
+            }
+        } else {
+            results.push(span.html());
+        }
+    }
+
+    utils::clean_content(&results.join("<br>"))
+}
diff --git a/src/parser/colles/mod.rs b/src/parser/colles/mod.rs
new file mode 100644
index 0000000..5e01722
--- /dev/null
+++ b/src/parser/colles/mod.rs
@@ -0,0 +1,7 @@
+mod class;
+mod fetch;
+mod upcoming;
+
+pub use class::fetch as fetch_class_colles;
+pub use fetch::fetch as fetch_colle;
+pub use upcoming::fetch as fetch_upcoming_colles;
diff --git a/src/parser/colles/upcoming.rs b/src/parser/colles/upcoming.rs
new file mode 100644
index 0000000..5652034
--- /dev/null
+++ b/src/parser/colles/upcoming.rs
@@ -0,0 +1,191 @@
+use crate::{
+    api::Colle,
+    parser::utils::{hash_text, parse_french_date, with_time},
+};
+use chrono::{Datelike, NaiveDate};
+use reqwest::Client;
+use std::{collections::HashMap, io::Write};
+use tempfile::NamedTempFile;
+
+// Splits a vector of strings into a HashMap based on dates.
+// The keys are the dates, and the values are vectors of strings containing the lines after that date.
+fn split_dates(text: Vec<&str>) -> HashMap<NaiveDate, Vec<String>> {
+    // Get the current year
+    let current_year = chrono::Utc::now().year();
+    let mut result = HashMap::new();
+
+    let mut content = Vec::new();
+    for line in text {
+        // Check if the line contains a year
+        if line.contains(&current_year.to_string()) {
+            let date = parse_french_date(line);
+            if date.is_ok() {
+                if !content.is_empty() {
+                    result.insert(date.unwrap(), content.clone());
+                    content.clear();
+                }
+            } else {
+                // If parsing failed, just continue
+                eprintln!("Failed to parse date from line: {}", line);
+            }
+        } else {
+            content.push(line.to_string());
+        }
+    }
+    // Return the result as a vector of strings
+    result
+}
+
+// Check if the line starts with a time format (e.g., "16h20")
+fn is_time(line: &str) -> bool {
+    let trimmed = line.trim();
+
+    if let Some(h_pos) = trimmed.find('h') {
+        // Check if there are only digits before 'h'
+        let before_h = &trimmed[..h_pos];
+        if !before_h.is_empty() && before_h.chars().all(|c| c.is_digit(10)) {
+            // Check that there are exactly 2 digits after 'h'
+            let after_h = &trimmed[h_pos + 1..];
+            return after_h.len() >= 2 && after_h.chars().take(2).all(|c| c.is_digit(10));
+        }
+    }
+
+    false
+}
+
+// Check if the line starts with a last name (uppercase first word)
+fn is_name(line: &str) -> bool {
+    let trimmed = line.trim();
+    if let Some(first_space) = trimmed.find(' ') {
+        let first_word = &trimmed[..first_space];
+        return first_word.chars().all(|c| c.is_uppercase() || c == '-');
+    }
+    false
+}
+
+// Extract all names from a line (split on "/" and trim each part)
+fn extract_names(line: &str) -> Vec<String> {
+    line.split('/')
+        .map(|s| s.trim().to_string())
+        .filter(|s| !s.is_empty())
+        .collect()
+}
+
+// Parse header (time, teacher, room)
+fn parse_header(line: &str) -> (String, String, String) {
+    // First 5 characters are the time
+    let time = line.get(0..5).unwrap_or("").to_string();
+    // Split the rest on "Salle :"
+    let parts: Vec<&str> = line[5..].split("Salle :").collect();
+    let teacher = parts.get(0).map_or("", |s| s.trim()).to_string();
+    let room = parts.get(1).map_or("", |s| s.trim()).to_string();
+    (time, teacher, room)
+}
+
+// Parse the PDF text into a vector of Upcoming colles
+fn parse_upcoming(text: Vec<&str>) -> Vec<Colle> {
+    let dates = split_dates(text);
+    let mut result = Vec::new();
+
+    for (date, lines) in dates {
+        let mut current_header = (String::new(), String::new(), String::new());
+        let mut current_subject = String::new();
+
+        for line in lines {
+            let trimmed = line.trim();
+            if trimmed.is_empty() {
+                continue;
+            }
+
+            if is_time(trimmed) {
+                current_header = parse_header(trimmed);
+            } else if is_name(trimmed) {
+                let names = extract_names(trimmed);
+                // Pre-allocate and extend instead of pushing individual items
+                let entries: Vec<Colle> = names
+                    .into_iter()
+                    .map(|name| Colle {
+                        date: with_time(date, &current_header.0).unwrap_or_default(),
+                        examiner: current_header.1.clone(),
+                        room: current_header.2.clone(),
+                        subject: current_subject.clone(),
+                        student: name,
+                        bjid: None,
+                        bjsecret: None,
+                        grade: None,
+                        content: None,
+                        comment: None,
+                        attachments: None,
+                    })
+                    .collect();
+                result.extend(entries);
+            } else {
+                current_subject = trimmed.to_string();
+            }
+        }
+    }
+
+    result
+}
+
+pub async fn fetch(session: &str, last_hash: &str) -> (Vec<Colle>, String) {
+    let url = "https://bjcolle.fr/oral_choice_week_billboard_cdt.php";
+    // Start date is 3 days ago, end date is 14 days from now (2 weeks)
+    let start_date = chrono::Utc::now().date_naive() - chrono::Duration::days(3);
+    let end_date = chrono::Utc::now().date_naive() + chrono::Duration::days(14);
+
+    let response = Client::new()
+        .post(url)
+        .header("Cookie", session)
+        .form(&[
+            ("datepicker_1", &start_date.format("%d/%m/%Y").to_string()),
+            ("datepicker_2", &end_date.format("%d/%m/%Y").to_string()),
+            ("VALIDER_SEMAINE", &"Valider".to_string()),
+        ])
+        .send()
+        .await
+        .expect("Failed to send request");
+
+    if !response.status().is_success() {
+        eprintln!(
+            "Failed to fetch upcoming colles: HTTP {}",
+            response.status()
+        );
+        return (Vec::new(), String::new());
+    }
+
+    // Save PDF to a temporary file
+    let body = response
+        .bytes()
+        .await
+        .expect("Failed to read response body");
+    let mut tmpfile = NamedTempFile::new().expect("Failed to create temp file");
+    tmpfile
+        .write_all(&body)
+        .expect("Failed to write PDF to temp file");
+
+    // Extract text using pdf_extract
+    let pdf_text = pdf_extract::extract_text(tmpfile.path()).expect("Failed to extract PDF text");
+    // Calculate the hash of the PDF file
+    let hash = hash_text(&pdf_text);
+    if hash == last_hash || pdf_text.contains("Aucune colle") {
+        return (Vec::new(), hash);
+    }
+
+    // Split the text into lines and filter out empty lines
+    let lines: Vec<&str> = pdf_text
+        .lines()
+        .filter(|line| !line.trim().is_empty())
+        .collect();
+    // Remove the first line
+    let lines: Vec<&str> = lines.into_iter().skip(1).collect();
+
+    // Parse the lines into Upcoming structs
+    let upcoming_colles = parse_upcoming(lines);
+    if upcoming_colles.is_empty() {
+        eprintln!("No upcoming colles found.");
+    } else {
+        println!("Found {} upcoming colles.", upcoming_colles.len());
+    }
+    (upcoming_colles, hash)
+}
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
new file mode 100644
index 0000000..771752d
--- /dev/null
+++ b/src/parser/mod.rs
@@ -0,0 +1,9 @@
+mod auth;
+mod utils;
+mod colles;
+
+pub use colles::fetch_class_colles;
+pub use colles::fetch_colle;
+pub use colles::fetch_upcoming_colles;
+
+pub use auth::authenticate;
diff --git a/src/parser/utils.rs b/src/parser/utils.rs
new file mode 100644
index 0000000..2343d62
--- /dev/null
+++ b/src/parser/utils.rs
@@ -0,0 +1,101 @@
+use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
+use sha2::{Digest, Sha256};
+use std::collections::HashMap;
+use std::error::Error;
+
+pub fn parse_french_date(french_date: &str) -> Result<NaiveDate, Box<dyn Error>> {
+    let months: HashMap<&str, u32> = [
+        ("janvier", 1),
+        ("février", 2),
+        ("mars", 3),
+        ("avril", 4),
+        ("mai", 5),
+        ("juin", 6),
+        ("juillet", 7),
+        ("août", 8),
+        ("septembre", 9),
+        ("octobre", 10),
+        ("novembre", 11),
+        ("décembre", 12),
+    ]
+    .iter()
+    .cloned()
+    .collect();
+
+    // Split by spaces
+    let parts: Vec<&str> = french_date.split_whitespace().collect();
+    if parts.len() < 4 {
+        return Err("Date string too short".into());
+    }
+
+    // parts layout: ["vendredi", "6", "juin", "2025"]
+    // Extract day (digits only)
+    let day: u32 = parts[1]
+        .chars()
+        .filter(|c| c.is_digit(10))
+        .collect::<String>()
+        .parse()?;
+
+    // Month number
+    let month = months
+        .get(parts[2])
+        .ok_or_else(|| format!("Unknown month: {}", parts[2]))?;
+
+    // Year
+    let year: i32 = parts[3].parse()?;
+
+    let date = NaiveDate::from_ymd_opt(year, *month, day).ok_or("Invalid date components")?;
+
+    Ok(date)
+}
+
+pub fn parse_french_datetime(french_date: &str) -> Result<NaiveDateTime, Box<dyn Error>> {
+    let parts: Vec<&str> = french_date.split_whitespace().collect();
+    if parts.len() < 8 {
+        return Err("DateTime string too short".into());
+    }
+
+    // Parse the date part using the existing function
+    let date_part = parts[0..4].join(" ");
+    let date = parse_french_date(&date_part)?;
+
+    // parts layout: ["vendredi", "6", "juin", "2025", "à", "19", "h", "00"]
+    // Extract hour and minute
+    let hour: u32 = parts[5].parse()?;
+    let minute: u32 = parts[7].parse()?;
+
+    let datetime = date
+        .and_hms_opt(hour, minute, 0)
+        .ok_or("Invalid time components")?;
+
+    Ok(datetime)
+}
+
+pub fn clean_content(content: &str) -> String {
+    // Remove HTML tags and decode HTML entities
+    let cleaned = content
+        .replace("<br>", "")
+        .replace("<br/>", "")
+        .replace("&nbsp;", " ")
+        .trim()
+        .to_string();
+    cleaned
+}
+
+pub fn with_time(date: NaiveDate, time_str: &str) -> Result<NaiveDateTime, String> {
+    // Replace 'h' with ':' to make parsing easier
+    let clean_time = time_str.replace('h', ":");
+
+    // Parse the time string into NaiveTime
+    match NaiveTime::parse_from_str(&clean_time, "%H:%M") {
+        Ok(time) => Ok(date.and_time(time)),
+        Err(_) => Err(format!("Invalid time format: {}", time_str)),
+    }
+}
+
+pub fn hash_text(text: &str) -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(text.as_bytes());
+    let result = hasher.finalize();
+    format!("{:x}", result)
+}