Nan's Blog: S3

Showing posts with label S3. Show all posts

Tuesday, October 17, 2023

Python - get folder size from s3 bucket

import boto3

# Name of the S3 bucket to inspect (placeholder value; replace before running).
bucket_name = 'xxxxxxx'

def get_size(bucket, path):
    """Return the combined size of every object under a key prefix.

    Args:
        bucket: Name of the S3 bucket to scan.
        path: Key prefix ("folder") whose objects are summed.

    Returns:
        The sum of the objects' ``size`` attributes divided by 1024 twice
        (bytes -> MiB), as a float.
    """
    target_bucket = boto3.resource('s3').Bucket(bucket)
    matching_objects = target_bucket.objects.filter(Prefix=path)
    size_in_bytes = sum(item.size for item in matching_objects)
    return size_in_bytes/1024/1024

def list_folders(s3_client, bucket_name):
    """Yield every top-level "folder" (common prefix) of a bucket.

    The bucket is listed with ``Delimiter='/'`` and an empty prefix, so each
    yielded value is the ``Prefix`` of one ``CommonPrefixes`` entry.

    A single ``list_objects_v2`` call returns only one page of results, so the
    listing is repeated with the returned continuation token until the
    response no longer carries one.

    Args:
        s3_client: Object exposing ``list_objects_v2(**kwargs)`` (a boto3 S3
            client).
        bucket_name: Name of the bucket to list.

    Yields:
        The prefix string of each top-level folder, in listing order.
    """
    request = {'Bucket': bucket_name, 'Prefix': '', 'Delimiter': '/'}
    while True:
        response = s3_client.list_objects_v2(**request)
        for content in response.get("CommonPrefixes", []):
            yield content.get('Prefix')

        # No token means this was the last page.
        next_token = response.get('NextContinuationToken')
        if not next_token:
            break
        request['ContinuationToken'] = next_token

# Report each top-level "folder" (common prefix) of the bucket with its size.
s3_client = boto3.client('s3')
folder_list = list_folders(s3_client, bucket_name)
for folder in folder_list:
    print('Folder found: %s' % folder)
    # get_size divides the byte total by 1024 twice, so this prints MiB.
    print(folder, get_size(bucket_name, folder))

Thursday, January 5, 2023

Lambda - delete and copy files from S3 sub-folders

import boto3
import json
import logging
from botocore.exceptions import ClientError

s3 = boto3.resource('s3')
s3_client = boto3.client('s3')


def lambda_handler(event, context):
    """Replace staging test data with a fresh copy from production.

    First deletes every object under ``stg_prefix`` in the staging bucket
    whose key contains the folder name, then copies every object under
    ``prd_prefix`` in the production bucket whose key contains that folder
    name into the staging bucket.

    Args:
        event: Mapping read with ``.get`` for the keys ``prd_bucket``,
            ``prd_prefix``, ``prd_folder_to_copy``, ``stg_bucket`` and
            ``stg_prefix``.
        context: Lambda context object; not used.

    Returns:
        ``{'statusCode': 200, 'body': '"Success!"'}``.
    """
    print(event)

    prd_bucket = event.get('prd_bucket')
    prd_prefix = event.get('prd_prefix')
    prd_folder_to_copy = event.get('prd_folder_to_copy')
    stg_bucket = event.get('stg_bucket')
    stg_prefix = event.get('stg_prefix')
    stg_folder_to_copy = prd_folder_to_copy

    def list_bucket_objects(bucket_name, prefix_name):
        """Return all object entries under a prefix, or None on a client error."""
        request = {'Bucket': bucket_name, 'Prefix': prefix_name}
        found = []
        while True:
            try:
                response = s3_client.list_objects_v2(**request)
            except ClientError as e:
                logging.error(e)
                return None
            # 'Contents' is absent from the response when nothing matches.
            found.extend(response.get('Contents', []))
            # One call returns a single page; keep going while a token exists.
            next_token = response.get('NextContinuationToken')
            if not next_token:
                return found
            request['ContinuationToken'] = next_token

    logging.info(f'delete existing test data from {stg_bucket}...')
    stg_objects = list_bucket_objects(stg_bucket, stg_prefix)
    if stg_objects:
        for obj in stg_objects:
            file_key = obj['Key']
            # The length check keeps an empty/one-character folder name from
            # matching (and deleting) every key under the prefix.
            if len(stg_folder_to_copy) > 1 and stg_folder_to_copy in file_key:
                logging.info(f"delete existing test data {obj['Key']}...")
                s3_client.delete_objects(
                    Bucket=stg_bucket,
                    Delete={'Objects': [{'Key': file_key}]})

    logging.info(f"Copy new test data from {prd_bucket} to {stg_bucket}...")
    objects = list_bucket_objects(prd_bucket, prd_prefix)
    if objects:
        for obj in objects:
            # The key must go through a %s placeholder; passing it as a bare
            # extra argument makes the logging call fail to format.
            logging.info("file_location: %s", obj['Key'])
            if prd_folder_to_copy in obj['Key']:
                file_name = obj['Key'].split('/')[-1]
                dest = s3.Bucket(stg_bucket)
                source = {'Bucket': prd_bucket, 'Key': obj['Key']}
                # The destination key is a plain concatenation, so the prefix
                # and folder name must carry their own trailing separators.
                dest.copy(source, stg_prefix + stg_folder_to_copy + file_name)

    return {
        'statusCode': 200,
        'body': json.dumps('Success!')
    }

Tuesday, June 14, 2022

Python - Upload file to S3 using pre-signed URL

import requests

# API endpoint that hands out pre-signed S3 URLs (placeholder value).
api_url = "xxxxxx"

# Destination key is `reports + file_name`; OBJECT_NAME_TO_UPLOAD is the local
# file whose content is sent.
reports = "test/"
file_name = "test.csv"
OBJECT_NAME_TO_UPLOAD = "test.csv"

params = {"action": "upload",
          "file_key": reports + file_name,
          "bucket_name": "xxxx"
          }

headers = {"x-api-key": "xxxx"}

session = requests.session()
# Generate pre-signed-url
response = session.get(api_url, headers=headers, json=params)
res = response.json()

print(res)


# Upload file to S3 using pre-signed URL.
# Read the file as raw bytes: text mode would decode with the platform default
# encoding and translate newlines, so the uploaded object could differ from
# the file on disk (and binary files could fail to decode at all).
with open(OBJECT_NAME_TO_UPLOAD, 'rb') as f:
    upload_response = session.request('PUT', res['url'], data=f.read())


print(f"Upload response: {upload_response.status_code}")

Thursday, March 10, 2022

Python - upload file to S3 using boto3

Get S3 Session

import ntpath
import os
import subprocess
import sys
import threading
import time

import boto3



def _get_s3_session():
    """Build a boto3 session from the credentials that aws-vault exports.

    Runs ``aws-vault exec your-aws-role -- env`` and scans its output for the
    ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY`` and ``AWS_SESSION_TOKEN``
    variables. A variable that does not appear is passed on as an empty
    string.

    Returns:
        A ``boto3.Session`` created from the three values, in that order.
    """
    raw_env = subprocess.check_output(
        ['aws-vault', 'exec', 'your-aws-role', '--', 'env'])

    credentials = {
        'AWS_ACCESS_KEY_ID': '',
        'AWS_SECRET_ACCESS_KEY': '',
        'AWS_SESSION_TOKEN': '',
    }
    for raw_line in raw_env.split(b'\n'):
        # Split on the first '=' only; the value itself may contain '='.
        name, separator, value = raw_line.decode('utf8').partition('=')
        if separator and name in credentials:
            credentials[name] = value

    return boto3.Session(
        credentials['AWS_ACCESS_KEY_ID'],
        credentials['AWS_SECRET_ACCESS_KEY'],
        credentials['AWS_SESSION_TOKEN'])

Upload files to S3

class ProgressPercentage(object):
    """Callable that writes a running transfer-progress line to stdout.

    Each call adds ``bytes_amount`` to a running total and rewrites the
    current terminal line (leading ``\\r``) with the prefix, the bytes seen,
    the total size and the percentage done. It is passed as the ``Callback``
    of an S3 ``upload_file`` call elsewhere in this file.

    Args:
        filename: Path of the file being transferred.
        size: Total number of bytes expected. When ``None`` the size is read
            from disk with ``os.path.getsize(filename)``.
        prefix_str: Text printed at the start of every progress line.
    """

    def __init__(self, filename, size=None, prefix_str=''):
        self._filename = filename
        if size is None:
            self._size = float(os.path.getsize(filename))
        else:
            self._size = size
        self._prefix_str = prefix_str
        self._seen_so_far = 0
        # Serialises updates to the running total and the output line.
        self._lock = threading.Lock()

    def __call__(self, bytes_amount):
        """Record ``bytes_amount`` more transferred bytes and redraw the line."""
        # To simplify we'll assume this is hooked up
        # to a single filename.
        with self._lock:
            self._seen_so_far += bytes_amount
            if self._size:
                percentage = (self._seen_so_far / self._size) * 100
            else:
                # A zero-byte file has nothing left to send; report it as
                # complete instead of dividing by zero.
                percentage = 100.0
            sys.stdout.write(
                "\r%s %s / %s  (%.2f%%)" % (
                    self._prefix_str,
                    self._seen_so_far,
                    self._size,
                    percentage))
            sys.stdout.flush()

def _upload_to_s3(self, **kwargs):
    """Upload a local file to S3 while printing progress to stdout.

    Stores the resolved values on ``self`` as ``_object_name``,
    ``_file_path`` and ``_bucket_name`` before uploading.

    Keyword Args:
        path: Local path of the file to upload (required).
        bucket_name: Destination bucket (required).
        object_name: Destination key. When omitted or ``None`` the base name
            of ``path`` is used.

    Raises:
        KeyError: If ``path`` or ``bucket_name`` is missing.
        Any exception raised by ``upload_file`` propagates to the caller.
    """
    # .get() so that leaving object_name out triggers the fallback below
    # instead of a KeyError.
    self._object_name = kwargs.get('object_name')
    self._file_path = kwargs['path']
    self._bucket_name = kwargs['bucket_name']

    # If S3 object_name was not specified, use file_name
    if self._object_name is None:
        self._object_name = ntpath.basename(self._file_path)

    # Upload the file
    session = _get_s3_session()
    s3_client = session.client("s3")

    # Perform the transfer. The source snippet opened a `try:` here with no
    # handler, which does not parse; errors are left to propagate.
    s3_client.upload_file(
        self._file_path,
        self._bucket_name,
        self._object_name,
        Callback=ProgressPercentage(
            self._file_path,
            prefix_str='Uploading file {:} -> {:}.{:}: '.format(
                os.path.dirname(self._file_path), self._bucket_name, self._object_name)),
    )

Wednesday, October 7, 2020

PySpark - Read Shape file from S3 and Mount S3 as File System

Advantages of Mounting Amazon S3 as a File System

Mounting an Amazon S3 bucket as a file system means that you can use all your existing tools
and applications to interact with the Amazon S3 bucket to perform read/write operations on
files and folders. Can EC2 mount Amazon S3? Using this method enables multiple Amazon
EC2 instances to concurrently mount and access data in Amazon S3, just like a shared file
system.

Why use an Amazon S3 file system? Any application interacting with the mounted drive
doesn’t have to worry about transfer protocols, security mechanisms, or Amazon
S3-specific API calls. In some cases, mounting Amazon S3 as a drive on an application
server can make creating a distributed file store extremely easy.

For example, when creating a photo upload application, you can have it store data on a fixed
path in a file system and when deploying you can mount an Amazon S3 bucket on that fixed
path. This way, the application will write all files in the bucket without you having to worry about
Amazon S3 integration at the application level. Another major advantage is to enable legacy
applications to scale in the cloud since there are no source code changes required to use an
Amazon S3 bucket as storage backend: the application can be configured to use a local path
where the Amazon S3 bucket is mounted. This technique is also very helpful when you want
to collect logs from various servers in a central location for archiving.

After mounting S3 as a local file system, you can use Pandas or other libraries to access a
file using a path like:

# Read the shapefile with geopandas through the path where the bucket is mounted.
Location = geopandas.read_file("/dbfs/mnt/bucket-name/geofactor/data/shapefilename.shp")

But in my case, mounting was not an option: we have limitations on mounting S3 as well as
permission issues (only Spark can read from the S3 bucket). In addition, a shapefile consists
of several files (.dbf, .prj, .shp and .shx) that have to be read together, so I zipped them
into a single archive. The catch is that Spark cannot read this zip file.

We have to use boto3 to read the zip file. The workaround, instead of mounting S3, is to
read the zip file into memory using BytesIO():
# Download the zipped shapefile into memory and open it as a zip archive.
# The in-memory buffer is handed to ZipFile directly; wrapping its content in
# a second BytesIO only copied the same bytes again.
buffer = BytesIO(zip_obj.get()["Body"].read())
zipfile = ZipFile(buffer)

Wednesday, July 15, 2020

Python / S3 - Functions to list keys in an S3 bucket using Python

Reference : https://alexwlchan.net/2017/07/listing-s3-keys/

from glob import glob
import boto3


class Versions:
    """Gets the latest version from local or s3.

    All helpers are static: they can be called on the class
    (``Versions.get_latest_version_from_local(path)``) or on an instance.
    "Latest" always means the greatest string in plain lexicographic order.
    """

    @staticmethod
    def get_latest_version_from_local(path):
        """Return the greatest entry name directly under ``path``.

        Args:
            path: Local directory whose immediate children are version names.

        Returns:
            The last path component of the lexicographically greatest child,
            or ``None`` when ``path`` has no children.
        """
        versions = [
            entry.rstrip('/').split("/").pop()
            for entry in glob((path + "/*"), recursive=True)
        ]
        return max(versions, default=None)

    @staticmethod
    def get_latest_version_from_s3(bucket_name, path):
        """Return the greatest second-level key component under a prefix.

        Only the last component of ``path`` is used as the listing prefix,
        and at most 100 keys (a single request) are examined.

        Args:
            bucket_name: Bucket to list.
            path: Path whose last component is the key prefix.

        Returns:
            The lexicographically greatest second component of the listed
            keys, or ``None`` when no key has one.
        """
        key = path.rstrip('/').split("/").pop()
        s3 = boto3.client('s3')
        response = s3.list_objects_v2(
            Bucket=bucket_name,
            Prefix=key,
            MaxKeys=100)
        versions = []
        # 'Contents' is absent from the response when nothing matches.
        for obj in response.get('Contents', []):
            split_path = obj['Key'].rstrip('/').split("/")
            # Keys without a "/" have no version component; skip them.
            if len(split_path) > 1:
                versions.append(split_path[1])
        return max(versions, default=None)

    @staticmethod
    def get_all_s3_keys(bucket_name):
        """Return the greatest 'well_production' key in an S3 bucket.

        Walks every page of the bucket listing and keeps the keys that
        contain ``well_production`` and do not contain ``$folder$``.

        Args:
            bucket_name: Bucket to scan.

        Returns:
            The lexicographically greatest matching key, or ``None`` when no
            key matches.
        """
        versions = []

        kwargs = {'Bucket': bucket_name}
        s3 = boto3.client('s3')
        while True:
            resp = s3.list_objects_v2(**kwargs)
            for obj in resp.get('Contents', []):
                key = str(obj['Key'])
                if 'well_production' in key and '$folder$' not in key:
                    versions.append(obj['Key'])
            # A missing token means the last page has been read.
            try:
                kwargs['ContinuationToken'] = resp['NextContinuationToken']
            except KeyError:
                break

        return max(versions, default=None)

Nan's Blog