import csv
from io import StringIO

import pandas as pd
import requests

# --- CONFIGURATION ---
# IMPORTANT: Replace this with the full URL for your Elvanto report,
# including the authkey.
# NOTE(review): this URL embeds a live authkey in source control — anyone
# with the key can view the report. Consider loading it from an environment
# variable instead.
REPORT_URL = "https://cairnspc.elvanto.com.au/report/?id=c5ab005a-f2be-403e-84a5-85870ac4a41b&authkey=lguYFfS9"

# The name of the file where the data will be saved.
OUTPUT_FILE = "scraped_elvanto_report.csv"

# Maps column headers as they appear EXACTLY in the Elvanto HTML report
# (keys) to the column names desired in the final CSV (values). This is the
# MOST LIKELY part you'll need to edit: open the report link in your browser
# to see the exact column headers and update the keys below. Column names
# might differ slightly between yearly tables, so check them all.
COLUMN_MAPPING = {
    'Date': 'Date',
    'Service': 'Service',
    'Total Individuals': 'Total Individuals',  # Might be 'Total Individual' or 'Sum' in your report
    'Adults': 'Adults',
    'Children': 'Children',
    'Guests': 'Guests',  # Elvanto reports often use 'Visitors' for guests.
}
# --- END OF CONFIGURATION ---


def _select_mapped_columns(table_df, table_number):
    """Rename one scraped table's columns via COLUMN_MAPPING and keep only
    the mapped ones.

    Args:
        table_df: A DataFrame scraped from one HTML table.
        table_number: 1-based position of the table, used in warnings.

    Returns:
        The reduced DataFrame, or None when no mappable columns were found
        (a warning is printed and the caller should skip the table).
    """
    renamed = table_df.rename(columns=COLUMN_MAPPING)
    kept = [col for col in COLUMN_MAPPING.values() if col in renamed.columns]
    if not kept:
        print(f"Warning: No mappable columns found in table {table_number}. Skipping this table.")
        return None
    return renamed[kept]


def _clean_and_format(df_final):
    """Drop summary rows and normalise the 'Date' column.

    Rows whose first column contains 'Total' or 'Average' are removed.
    Dates are rewritten as 'YYYY-MM-DD HH:MM:SS' strings and the frame is
    sorted newest-first; a formatting failure is reported but non-fatal.

    Returns:
        The cleaned DataFrame.
    """
    first_col = df_final.iloc[:, 0].astype(str)
    df_final = df_final[~first_col.str.contains('Total|Average', na=False)]

    if 'Date' in df_final.columns:
        try:
            df_final = df_final.copy()
            df_final['Date'] = pd.to_datetime(df_final['Date']).dt.strftime('%Y-%m-%d %H:%M:%S')
            # Lexicographic sort is chronologically correct here because the
            # format is zero-padded and year-first.
            df_final.sort_values(by='Date', ascending=False, inplace=True)
        except Exception as e:
            print(f"\nWarning: Could not automatically format the 'Date' column. It may contain unexpected values. Error: {e}")
    return df_final


def main():
    """Fetch the Elvanto report page, scrape all data tables on it, and
    write the combined, cleaned results to OUTPUT_FILE as CSV."""
    if "YOUR_REPORT_URL_HERE" in REPORT_URL:
        print("Error: Please update the 'REPORT_URL' in the script before running.")
        return

    print("Attempting to fetch data from your Elvanto report URL...")

    try:
        # Mimic a web browser; some servers reject default client UAs.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(REPORT_URL, headers=headers)
        response.raise_for_status()
        print("Successfully fetched the report page.")

        # pandas.read_html() finds every <table> on the page and returns a
        # list of DataFrames. Wrapping the HTML in StringIO avoids the
        # pandas deprecation of passing a literal HTML string.
        tables = pd.read_html(StringIO(response.text))

        if not tables:
            print("\nError: No data tables were found on the report page.")
            print("Please check that the URL is correct and the report is loading properly in your browser.")
            return

        print(f"Found {len(tables)} data table(s) on the page. Combining them...")

        all_dataframes = []
        for i, table_df in enumerate(tables):
            print(f"--- Processing table {i + 1} of {len(tables)} ---")
            processed = _select_mapped_columns(table_df, i + 1)
            if processed is not None:
                all_dataframes.append(processed)

        if not all_dataframes:
            print("\nError: After processing all tables, no valid data could be extracted.")
            print("Please double-check your 'COLUMN_MAPPING' configuration.")
            return

        # Concatenate all the processed dataframes into one.
        df_final = pd.concat(all_dataframes, ignore_index=True)
        print("\nSuccessfully combined all tables.")

        df_final = _clean_and_format(df_final)

        df_final.to_csv(OUTPUT_FILE, index=False, quoting=csv.QUOTE_ALL)

        print(f"\nSuccess! Scraped {len(df_final)} records and saved them to '{OUTPUT_FILE}'.")
        print(f"Final columns in CSV: {df_final.columns.tolist()}")

    except requests.exceptions.RequestException as e:
        print(f"\nError: Failed to fetch the URL. Please check the REPORT_URL and your internet connection. Details: {e}")
    except Exception as e:
        print(f"\nAn unexpected error occurred during processing: {e}")


if __name__ == "__main__":
    main()