Showing posts with label S3. Show all posts
Showing posts with label S3. Show all posts

Tuesday, October 17, 2023

Python - get folder size from s3 bucket

 import boto3

# Bucket scanned by the script at the bottom of this snippet.
# NOTE(review): 'xxxxxxx' looks like a redacted placeholder -- replace it with a real bucket name.
bucket_name = 'xxxxxxx'

def get_size(bucket, path):
    """Return the combined size of every object under a key prefix.

    Args:
        bucket: Name of the S3 bucket to inspect.
        path: Key prefix ("folder") whose objects are summed.

    Returns:
        The summed ``size`` of all matching objects divided by 1024 twice
        (bytes -> MiB), as a float.
    """
    bucket_resource = boto3.resource('s3').Bucket(bucket)
    byte_count = sum(
        item.size for item in bucket_resource.objects.filter(Prefix=path)
    )
    return byte_count / 1024 / 1024

def list_folders(s3_client, bucket_name):
    """Yield every top-level "folder" prefix in an S3 bucket.

    Lists the bucket with ``Delimiter='/'`` so that S3 groups keys by their
    first path component, and yields each resulting common prefix
    (for example ``'reports/'``).

    Args:
        s3_client: An object exposing ``list_objects_v2(**kwargs)``, such as
            a boto3 S3 client.
        bucket_name: Name of the bucket to list.

    Yields:
        The ``Prefix`` value of each entry in ``CommonPrefixes``.
    """
    kwargs = {'Bucket': bucket_name, 'Prefix': '', 'Delimiter': '/'}
    while True:
        response = s3_client.list_objects_v2(**kwargs)
        # 'CommonPrefixes' is absent when a page contains no grouped prefixes.
        for content in response.get("CommonPrefixes", []):
            yield content.get('Prefix')

        # A single response is capped in size; keep following the
        # continuation token so no prefixes are silently dropped.
        token = response.get('NextContinuationToken')
        if not token:
            return
        kwargs['ContinuationToken'] = token

# Report each top-level folder of the bucket together with its total size.
s3_client = boto3.client('s3')
folder_list = list_folders(s3_client, bucket_name)
for folder in folder_list:
    print('Folder found: %s' % folder)
    folder_size = get_size(bucket_name, folder)
    print(folder, folder_size)

Thursday, January 5, 2023

Lambda - delete and copy files from S3 sub-folders

import boto3
import json
import logging
from botocore.exceptions import ClientError

# Created once at module import so every call to lambda_handler shares them:
# the resource API is used for copying, the client API for listing/deleting.
s3 = boto3.resource('s3')
s3_client = boto3.client('s3')


def lambda_handler(event, context):
    """Replace staging test data with a fresh copy from production.

    Step 1 deletes every object under ``stg_prefix`` in the staging bucket
    whose key contains the folder name. Step 2 copies every object under
    ``prd_prefix`` in the production bucket whose key contains the folder
    name to ``stg_prefix + folder + <file name>`` in the staging bucket.

    Args:
        event: Mapping with the keys ``prd_bucket``, ``prd_prefix``,
            ``prd_folder_to_copy``, ``stg_bucket`` and ``stg_prefix``.
        context: Lambda context object; not used.

    Returns:
        A dict with ``statusCode`` 200 and a JSON-encoded ``'Success!'`` body.
    """
    print(event)

    prd_bucket = event.get('prd_bucket')
    prd_prefix = event.get('prd_prefix')
    prd_folder_to_copy = event.get('prd_folder_to_copy')
    stg_bucket = event.get('stg_bucket')
    stg_prefix = event.get('stg_prefix')
    # The folder keeps the same name on both sides.
    stg_folder_to_copy = prd_folder_to_copy

    def list_bucket_objects(bucket_name, prefix_name):
        """Return every object under a prefix, or None if listing fails."""
        found = []
        kwargs = {'Bucket': bucket_name, 'Prefix': prefix_name}
        try:
            while True:
                response = s3_client.list_objects_v2(**kwargs)
                # 'Contents' is missing (not empty) when nothing matches.
                found.extend(response.get('Contents', []))
                # Follow continuation tokens; one response is size-capped.
                token = response.get('NextContinuationToken')
                if not token:
                    return found
                kwargs['ContinuationToken'] = token
        except ClientError as e:
            logging.error(e)
            return None

    logging.info(f'delete existing test data from {stg_bucket}...')
    stg_objects = list_bucket_objects(stg_bucket, stg_prefix)
    # The length guard keeps an empty or one-character folder name from
    # matching (and deleting) practically every key under the prefix.
    if stg_objects and len(stg_folder_to_copy) > 1:
        to_delete = []
        for obj in stg_objects:
            file_key = obj['Key']
            if stg_folder_to_copy in file_key:
                logging.info(f"delete existing test data {obj['Key']}...")
                to_delete.append({'Key': file_key})
        # delete_objects accepts at most 1000 keys per request.
        for start in range(0, len(to_delete), 1000):
            s3_client.delete_objects(
                Bucket=stg_bucket,
                Delete={'Objects': to_delete[start:start + 1000]})

    logging.info(f"Copy new test data from {prd_bucket} to {stg_bucket}...")
    objects = list_bucket_objects(prd_bucket, prd_prefix)
    if objects:
        dest = s3.Bucket(stg_bucket)
        for obj in objects:
            source_key = obj['Key']
            logging.info("file_location: %s", source_key)
            if prd_folder_to_copy in source_key:
                # Only the last path component is kept, so nested
                # sub-folders are flattened into the target folder.
                file_name = source_key.split('/')[-1]
                source = {'Bucket': prd_bucket, 'Key': source_key}
                dest.copy(source, stg_prefix + stg_folder_to_copy + file_name)

    return {
        'statusCode': 200,
        'body': json.dumps('Success!')
    }

Tuesday, June 14, 2022

Python - Upload file to S3 using pre-signed URL

 

import requests

# API endpoint that issues pre-signed URLs (placeholder value).
api_url = "xxxxxx"

reports = "test/"
file_name = "test.csv"
# Local file whose contents are uploaded.
OBJECT_NAME_TO_UPLOAD = "test.csv"

params = {
    "action": "upload",
    "file_key": reports + file_name,
    "bucket_name": "xxxx",
}

headers = {"x-api-key": "xxxx"}

session = requests.session()
# Generate pre-signed-url
response = session.get(api_url, headers=headers, json=params)
# Fail here with a clear HTTP error instead of a confusing KeyError on
# res['url'] further down when the API rejects the request.
response.raise_for_status()
res = response.json()

print(res)


# Upload file to S3 using pre-signed URL.
# Read raw bytes: text mode would decode with the platform encoding and
# translate newlines, which corrupts binary or non-UTF-8 files.
with open(OBJECT_NAME_TO_UPLOAD, 'rb') as f:
    upload_response = session.request('PUT', res['url'], data=f.read())


print(f"Upload response: {upload_response.status_code}")

Thursday, March 10, 2022

Python - upload file to S3 using boto3

Get S3 Session


import boto3


def _get_s3_session():
    """Build a boto3 session from credentials issued by ``aws-vault``.

    Runs ``aws-vault exec your-aws-role -- env`` and reads the
    ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY`` and ``AWS_SESSION_TOKEN``
    variables from the printed environment.

    Returns:
        A ``boto3.Session`` created with those three values; a variable that
        is not present in the output is passed as an empty string.

    Raises:
        subprocess.CalledProcessError: If ``aws-vault`` exits with a
            non-zero status.
    """
    # Local import: subprocess is not imported alongside boto3 above.
    import subprocess

    env_output = subprocess.check_output(
        ['aws-vault', 'exec', 'your-aws-role', '--', 'env'])

    env = {}
    for raw_line in env_output.splitlines():
        # Split on the first '=' only; values may themselves contain '='.
        key, sep, value = raw_line.decode('utf8').partition('=')
        if sep:
            env[key] = value

    return boto3.Session(
        aws_access_key_id=env.get('AWS_ACCESS_KEY_ID', ''),
        aws_secret_access_key=env.get('AWS_SECRET_ACCESS_KEY', ''),
        aws_session_token=env.get('AWS_SESSION_TOKEN', ''),
    )

Upload files to S3

class ProgressPercentage(object):
    """Transfer callback that prints a running byte count and percentage.

    An instance is called with the number of bytes transferred since the
    previous call; each call rewrites one status line on stdout.
    """

    def __init__(self, filename, size=None, prefix_str=''):
        """Set up the counter.

        Args:
            filename: Path of the file being transferred.
            size: Total size in bytes; when None it is read from
                ``filename`` on disk.
            prefix_str: Text printed at the start of every status line.
        """
        # Local imports: only boto3 is imported at the top of this snippet.
        import os
        import threading

        self._filename = filename
        if size is None:
            self._size = float(os.path.getsize(filename))
        else:
            self._size = size
        self._prefix_str = prefix_str
        self._seen_so_far = 0
        # Guards the counter and the stdout write against concurrent calls.
        self._lock = threading.Lock()

    def __call__(self, bytes_amount):
        """Add ``bytes_amount`` to the running total and reprint the line."""
        import sys

        # To simplify we'll assume this is hooked up
        # to a single filename.
        with self._lock:
            self._seen_so_far += bytes_amount
            if self._size:
                percentage = (self._seen_so_far / self._size) * 100
            else:
                # A zero-byte file has nothing left to send; report it as
                # complete instead of dividing by zero.
                percentage = 100.0
            sys.stdout.write(
                "\r%s %s / %s (%.2f%%)" % (
                    self._prefix_str,
                    self._seen_so_far,
                    self._size,
                    percentage))
            sys.stdout.flush()
def _upload_to_s3(self, **kwargs):
    """Upload a local file to S3 while printing transfer progress.

    NOTE(review): this takes ``self`` and stores its inputs on it, so it
    presumably belongs to an uploader class whose header is not part of this
    excerpt -- confirm where it should live.

    Keyword Args:
        path: Local path of the file to upload (required).
        bucket_name: Destination bucket (required).
        object_name: Destination key; when omitted or None, the base name of
            ``path`` is used.

    Raises:
        KeyError: If ``path`` or ``bucket_name`` is not supplied.

    Any error raised by the transfer itself propagates to the caller; the
    original snippet opened a ``try`` block here but showed no handler.
    """
    # Local imports: only boto3 is imported at the top of this snippet.
    import ntpath
    import os

    self._object_name = kwargs.get('object_name')
    self._file_path = kwargs['path']
    self._bucket_name = kwargs['bucket_name']

    # If S3 object_name was not specified, use file_name
    if self._object_name is None:
        # ntpath.basename copes with both '/' and '\\' separators.
        self._object_name = ntpath.basename(self._file_path)

    # Upload the file
    session = _get_s3_session()
    s3_client = session.client("s3")

    # Perform the transfer
    s3_client.upload_file(
        self._file_path,
        self._bucket_name,
        self._object_name,
        Callback=ProgressPercentage(
            self._file_path,
            prefix_str='Uploading file {:} -> {:}.{:}: '.format(
                os.path.dirname(self._file_path), self._bucket_name, self._object_name)),
    )


Wednesday, October 7, 2020

PySpark - Read Shape file from S3 and Mount S3 as File System


Advantages of Mounting Amazon S3 as a File System

Mounting an Amazon S3 bucket as a file system means that you can use all your existing tools 
and applications to interact with the Amazon S3 bucket to perform read/write operations on 
files and folders. Can EC2 mount Amazon S3? Using this method enables multiple Amazon 
EC2 instances to concurrently mount and access data in Amazon S3, just like a shared file 
system.
Why use an Amazon S3 file system? Any application interacting with the mounted drive 
doesn’t have to worry about transfer protocols, security mechanisms, or Amazon 
S3-specific API calls. In some cases, mounting Amazon S3 as a drive on an application 
server can make creating a distributed file store extremely easy.
For example, when creating a photo upload application, you can have it store data on a fixed 
path in a file system and when deploying you can mount an Amazon S3 bucket on that fixed 
path. This way, the application will write all files in the bucket without you having to worry about
 Amazon S3 integration at the application level. Another major advantage is to enable legacy 
applications to scale in the cloud since there are no source code changes required to use an 
Amazon S3 bucket as storage backend: the application can be configured to use a local path 
where the Amazon S3 bucket is mounted. This technique is also very helpful when you want 
to collect logs from various servers in a central location for archiving.
After mounting S3 as a local file system, you can use pandas or other libraries to access 
files using a path like: 
Location = geopandas.read_file("/dbfs/mnt/bucket-name/geofactor/data/shapefilename.shp")
In my case, however, mounting S3 was not an option: we had limitations on mounting as well as 
permission issues (only Spark can read from the S3 bucket). A shapefile also consists of several 
files (.dbf, .prj, .shp and .shx) that have to be read together, so I zipped them into one archive. 
Spark, however, cannot read this zip file. 

We have to use boto3 to read the zip file instead. The workaround, rather than mounting S3, is to 
load the zip file into memory using BytesIO().
# Download the whole archive into memory; zip_obj is the S3 object holding the zip.
buffer = BytesIO(zip_obj.get()["Body"].read())
# ZipFile only needs a seekable file-like object, so the buffer is used
# directly instead of being copied into a second BytesIO.
# NOTE: the name `zipfile` shadows the standard-library module of the same name.
zipfile = ZipFile(buffer)





Wednesday, July 15, 2020

Python / S3 - Functions to list keys in an S3 bucket using Python


from glob import glob
import boto3


class Versions:
    """Look up the latest version name on local disk or in S3.

    "Latest" always means the greatest name in plain string order, so version
    names must sort lexicographically (for example ``v10`` sorts before
    ``v9``). Every lookup is a static method and can be called on the class
    or on an instance.
    """

    def __init__(self):
        """Gets the latest version from local or s3"""
        pass

    @staticmethod
    def _iter_keys(s3_client, **kwargs):
        """Yield every object key from a paginated ``list_objects_v2`` call."""
        while True:
            resp = s3_client.list_objects_v2(**kwargs)
            # 'Contents' is missing (not empty) when a page has no objects.
            for obj in resp.get('Contents', []):
                yield obj['Key']
            token = resp.get('NextContinuationToken')
            if not token:
                return
            kwargs['ContinuationToken'] = token

    @staticmethod
    def get_latest_version_from_local(path):
        """Gets the latest version from Local.

        Args:
            path: Directory whose immediate children are named after versions.

        Returns:
            The greatest child name in string order, or None when ``path``
            has no entries.
        """
        names = [
            entry.rstrip('/').split("/")[-1]
            for entry in glob(path + "/*", recursive=True)
        ]
        return max(names, default=None)

    @staticmethod
    def get_latest_version_from_s3(bucket_name, path, s3_client=None):
        """Gets the latest version from s3.

        The last component of ``path`` is used as the key prefix, and the
        second component of each matching key is treated as its version
        (``<prefix>/<version>/...``).

        Args:
            bucket_name: Bucket to search.
            path: Path whose last component names the top-level folder.
            s3_client: Optional object exposing ``list_objects_v2``; a boto3
                S3 client is created when omitted.

        Returns:
            The greatest version name in string order, or None when no key
            under the prefix has a version component.
        """
        prefix = path.rstrip('/').split("/").pop()
        if s3_client is None:
            s3_client = boto3.client('s3')

        # Walk every page: S3 returns keys in ascending order, so a single
        # capped page would only ever contain the *lowest* versions.
        versions = []
        for key in Versions._iter_keys(
                s3_client, Bucket=bucket_name, Prefix=prefix):
            parts = key.rstrip('/').split("/")
            if len(parts) > 1:
                versions.append(parts[1])
        return max(versions, default=None)

    @staticmethod
    def get_all_s3_keys(bucket_name, s3_client=None):
        """Return the greatest ``well_production`` key in an S3 bucket.

        Scans every key in the bucket and keeps those that contain
        ``'well_production'`` but not the ``'$folder$'`` marker.

        Args:
            bucket_name: Bucket to scan.
            s3_client: Optional object exposing ``list_objects_v2``; a boto3
                S3 client is created when omitted.

        Returns:
            The greatest matching key in string order (a single key, not a
            list), or None when nothing matches.
        """
        if s3_client is None:
            s3_client = boto3.client('s3')

        versions = [
            key
            for key in Versions._iter_keys(s3_client, Bucket=bucket_name)
            if 'well_production' in str(key) and '$folder$' not in str(key)
        ]
        return max(versions, default=None)