# Elvanto attendance-report scraper: fetches an HTML report page and exports
# its data tables to a single CSV file.
import csv
import io

import pandas as pd
import requests
# --- CONFIGURATION ---

# IMPORTANT: Replace this with the full URL for your Elvanto report,
# including the authkey.
# NOTE(review): the authkey embedded in this URL grants access to the report —
# avoid committing this file to a shared or public repository.
REPORT_URL = "https://cairnspc.elvanto.com.au/report/?id=c5ab005a-f2be-403e-84a5-85870ac4a41b&authkey=lguYFfS9"

# The name of the file where the data will be saved.
OUTPUT_FILE = "scraped_elvanto_report.csv"

# This mapping helps rename columns from the Elvanto HTML report
# to the desired CSV column names. This is the MOST LIKELY part you'll need to edit.
#
# - Keys: The column names as they appear EXACTLY in the Elvanto report table.
# - Values: The new column names you want in your final CSV.
#
# Open the report link in your browser to see the exact column headers and update the keys below.
# Note: Column names might be slightly different between yearly tables, so check them all.
COLUMN_MAPPING = {
    'Date': 'Date',
    'Service': 'Service',
    'Total Individuals': 'Total Individuals', # This might be 'Total Individual' or 'Sum' in your report
    'Adults': 'Adults',
    'Children': 'Children',
    'Guests': 'Guests', # Elvanto reports often use 'Visitors' for guests.
}
# --- END OF CONFIGURATION ---
def main():
    """
    Fetch the Elvanto report page, scrape every HTML data table it contains,
    normalise column names via COLUMN_MAPPING, and write the combined rows
    to OUTPUT_FILE as a fully-quoted CSV.

    Returns None. All failures (network errors, missing tables, unmappable
    columns) are reported to stdout rather than raised.
    """
    if "YOUR_REPORT_URL_HERE" in REPORT_URL:
        print("Error: Please update the 'REPORT_URL' in the script before running.")
        return

    print("Attempting to fetch data from your Elvanto report URL...")

    try:
        # Mimic a real browser; some servers reject the default requests UA.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # An explicit timeout prevents the script from hanging forever on an
        # unresponsive server; timeouts surface as RequestException below.
        response = requests.get(REPORT_URL, headers=headers, timeout=30)
        response.raise_for_status()
        print("Successfully fetched the report page.")

        # pandas.read_html() finds every <table> tag on the page and returns
        # a list of DataFrames, one per table. Wrapping the markup in StringIO
        # avoids the pandas >= 2.1 deprecation of passing literal HTML strings.
        tables = pd.read_html(io.StringIO(response.text))

        if not tables:
            print("\nError: No data tables were found on the report page.")
            print("Please check that the URL is correct and the report is loading properly in your browser.")
            return

        print(f"Found {len(tables)} data table(s) on the page. Combining them...")

        all_dataframes = []

        for i, table_df in enumerate(tables):
            print(f"--- Processing table {i+1} of {len(tables)} ---")

            processed_df = table_df.copy()
            processed_df.rename(columns=COLUMN_MAPPING, inplace=True)

            # Keep only the mapped columns that actually exist in this table;
            # yearly tables may not all share identical headers.
            required_columns = list(COLUMN_MAPPING.values())
            final_columns = [col for col in required_columns if col in processed_df.columns]

            if not final_columns:
                print(f"Warning: No mappable columns found in table {i+1}. Skipping this table.")
                continue

            all_dataframes.append(processed_df[final_columns])

        if not all_dataframes:
            print("\nError: After processing all tables, no valid data could be extracted.")
            print("Please double-check your 'COLUMN_MAPPING' configuration.")
            return

        # Concatenate all the processed dataframes into one.
        df_final = pd.concat(all_dataframes, ignore_index=True)
        print("\nSuccessfully combined all tables.")

        # Data cleaning: drop summary rows such as 'Total' or 'Average',
        # matched anywhere in the first column's text.
        df_final = df_final[~df_final.iloc[:, 0].astype(str).str.contains('Total|Average', na=False)]

        # Normalise the date column and sort newest-first, if present.
        if 'Date' in df_final.columns:
            try:
                df_final['Date'] = pd.to_datetime(df_final['Date']).dt.strftime('%Y-%m-%d %H:%M:%S')
                # The '%Y-%m-%d %H:%M:%S' format sorts chronologically even
                # as plain strings, so sorting after formatting is safe.
                df_final.sort_values(by='Date', ascending=False, inplace=True)
            except Exception as e:
                print(f"\nWarning: Could not automatically format the 'Date' column. It may contain unexpected values. Error: {e}")

        df_final.to_csv(OUTPUT_FILE, index=False, quoting=csv.QUOTE_ALL)

        print(f"\nSuccess! Scraped {len(df_final)} records and saved them to '{OUTPUT_FILE}'.")
        print(f"Final columns in CSV: {df_final.columns.tolist()}")

    except requests.exceptions.RequestException as e:
        print(f"\nError: Failed to fetch the URL. Please check the REPORT_URL and your internet connection. Details: {e}")
    except Exception as e:
        print(f"\nAn unexpected error occurred during processing: {e}")
if __name__ == "__main__":
    # Run the scraper only when executed as a script, not when imported.
    main()