# python-toolbox/file-tools/s3_sync.py
# Sync a local folder to an Amazon S3 bucket.

import os
import argparse
import boto3
from botocore.exceptions import NoCredentialsError, ClientError
import logging
from datetime import datetime, timezone
# --- Configuration ---
# Set up logging to provide clear output
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def get_s3_client():
"""
Initializes and returns a Boto3 S3 client.
Handles credential errors gracefully.
"""
try:
# Boto3 will automatically look for credentials in the standard locations:
# 1. Environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
# 2. The ~/.aws/credentials file
s3_client = boto3.client('s3')
# A quick check to ensure credentials are valid
s3_client.list_buckets()
return s3_client
except NoCredentialsError:
logging.error("AWS credentials not found. Please configure them using 'aws configure' or environment variables.")
return None
except ClientError as e:
if e.response['Error']['Code'] == 'InvalidAccessKeyId':
logging.error("Invalid AWS Access Key ID. Please check your credentials.")
else:
logging.error(f"An AWS client error occurred: {e}")
return None
except Exception as e:
logging.error(f"An unexpected error occurred during S3 client initialization: {e}")
return None
def get_s3_objects(s3_client, bucket_name):
"""
Fetches all objects in the S3 bucket and returns a dictionary
mapping object keys to their last modified timestamps.
"""
s3_objects = {}
try:
# Use a paginator to handle buckets with more than 1000 objects
paginator = s3_client.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=bucket_name)
for page in pages:
if "Contents" in page:
for obj in page['Contents']:
s3_objects[obj['Key']] = obj['LastModified']
except ClientError as e:
if e.response['Error']['Code'] == 'NoSuchBucket':
logging.error(f"The bucket '{bucket_name}' does not exist.")
else:
logging.error(f"Could not list objects in bucket '{bucket_name}': {e}")
return None
return s3_objects
def sync_folder_to_s3(s3_client, local_folder, bucket_name, delete_extra_files):
"""
Syncs the contents of a local folder to an S3 bucket.
"""
if not os.path.isdir(local_folder):
logging.error(f"Local directory not found: {local_folder}")
return
logging.info(f"Starting sync from '{local_folder}' to S3 bucket '{bucket_name}'...")
s3_objects = get_s3_objects(s3_client, bucket_name)
if s3_objects is None:
logging.error("Aborting sync due to S3 error.")
return
local_files = set()
upload_count = 0
skip_count = 0
# --- Step 1: Walk local directory and upload new/modified files ---
for root, _, files in os.walk(local_folder):
for filename in files:
local_path = os.path.join(root, filename)
# Create the relative path to use as the S3 object key
relative_path = os.path.relpath(local_path, local_folder)
# S3 uses forward slashes, so convert for cross-platform compatibility
s3_key = relative_path.replace(os.path.sep, '/')
local_files.add(s3_key)
local_mtime_dt = datetime.fromtimestamp(os.path.getmtime(local_path), tz=timezone.utc)
# Check if file needs to be uploaded
if s3_key not in s3_objects or local_mtime_dt > s3_objects[s3_key]:
try:
logging.info(f"Uploading: {s3_key}")
s3_client.upload_file(local_path, bucket_name, s3_key)
upload_count += 1
except ClientError as e:
logging.error(f"Failed to upload {local_path}: {e}")
else:
logging.debug(f"Skipping (unchanged): {s3_key}")
skip_count += 1
logging.info("Local file scan complete.")
# --- Step 2: Delete files from S3 that are not present locally (if enabled) ---
delete_count = 0
if delete_extra_files:
logging.info("Checking for files to delete from S3...")
s3_keys_to_delete = [
{'Key': key} for key in s3_objects if key not in local_files
]
if s3_keys_to_delete:
# S3 delete_objects can handle up to 1000 keys at a time
for i in range(0, len(s3_keys_to_delete), 1000):
chunk = s3_keys_to_delete[i:i + 1000]
try:
logging.info(f"Deleting {len(chunk)} files from S3...")
s3_client.delete_objects(
Bucket=bucket_name,
Delete={'Objects': chunk}
)
delete_count += len(chunk)
except ClientError as e:
logging.error(f"Failed to delete objects from S3: {e}")
else:
logging.info("No files to delete from S3.")
# --- Final Summary ---
logging.info("="*30)
logging.info("Sync Summary")
logging.info(f" - Uploaded: {upload_count} files")
logging.info(f" - Skipped: {skip_count} files (up-to-date)")
if delete_extra_files:
logging.info(f" - Deleted: {delete_count} files from S3")
logging.info("Sync complete.")
logging.info("="*30)
if __name__ == "__main__":
# --- Command-Line Argument Parsing ---
parser = argparse.ArgumentParser(description="Sync a local folder to an Amazon S3 bucket.")
parser.add_argument("local_folder", help="The local folder to sync.")
parser.add_argument("bucket_name", help="The name of the S3 bucket.")
parser.add_argument(
"--delete",
action="store_true",
help="Delete files from the S3 bucket that do not exist in the local folder."
)
args = parser.parse_args()
s3_client = get_s3_client()
if s3_client:
sync_folder_to_s3(s3_client, args.local_folder, args.bucket_name, args.delete)