From 8ceeafe5ce4db016727118826724d4494226f29d Mon Sep 17 00:00:00 2001 From: Alex Meadows Date: Sat, 22 Jun 2019 08:15:55 -0400 Subject: [PATCH] process_tracker_python-55 Read config file from s3 :sparkles: Settings files should now be able to be read from an s3 location Added ability for settings config files to be read from s3. This will be helpful when wiring process_tracker into Lambda or Glue functions. Closes #55 --- README.md | 1 + process_tracker/utilities/aws_utilities.py | 143 ++++++++++++ process_tracker/utilities/settings.py | 73 ++++++- tests/fixtures/process_tracker_config.ini | 8 + tests/test_process_tracker.py | 15 +- tests/utilities/test_aws_utilities.py | 243 +++++++++++++++++++++ tests/utilities/test_settings_manager.py | 74 +++++++ 7 files changed, 544 insertions(+), 13 deletions(-) create mode 100644 process_tracker/utilities/aws_utilities.py create mode 100644 tests/fixtures/process_tracker_config.ini create mode 100644 tests/utilities/test_aws_utilities.py diff --git a/README.md b/README.md index 4f84325..1c47d8b 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ ProcessTracker - Python Data integration process management made easy! [![Coverage Status](https://coveralls.io/repos/github/OpenDataAlex/process_tracker_python/badge.svg?branch=master)](https://coveralls.io/github/OpenDataAlex/process_tracker_python?branch=master) +[![Codacy Badge](https://api.codacy.com/project/badge/Grade/5600be1990974a4688a4fa4852edb5d5)](https://www.codacy.com/app/alexmeadows_2718/process_tracker_python?utm_source=github.com&utm_medium=referral&utm_content=OpenDataAlex/process_tracker_python&utm_campaign=Badge_Grade) [![Build Status](https://travis-ci.org/OpenDataAlex/process_tracker_python.svg?branch=master)](https://travis-ci.org/OpenDataAlex/process_tracker_python) [![Downloads](https://pepy.tech/badge/processtracker)](https://pepy.tech/project/processtracker) [![PyPI version](https://badge.fury.io/py/processtracker.svg)](https://badge.fury.io/py/processtracker) diff --git a/process_tracker/utilities/aws_utilities.py b/process_tracker/utilities/aws_utilities.py new file mode 100644 index 0000000..c263a88 --- /dev/null +++ b/process_tracker/utilities/aws_utilities.py @@ -0,0 +1,143 @@ +# AWS Utilities +# Utilities for working with AWS services + +import logging + +import boto3 +from botocore.errorfactory import ClientError + + +class AwsUtilities: + def __init__(self): + + self.logger = logging.getLogger(__name__) + self.logger.setLevel("DEBUG") + + self.s3 = boto3.resource("s3") + + def determine_bucket_name(self, path): + """ + For the given path, return the bucket name, if path is a valid s3 URL. + :param path: Valid s3 URL. + :return: + """ + if self.determine_valid_s3_path(path=path): + self.logger.debug("Parsing %s" % path) + + if "s3://" in path: + path = path[path.startswith("s3://") and len("s3://") :] + + self.logger.debug("Path is now %s" % path) + + bucket_name = path.split("/")[0] + + self.logger.debug("Bucket name is %s" % bucket_name) + + elif "s3" in path and ".amazonaws.com" in path: + if path.startswith("http://"): + + path = path[len("http://") :] + + self.logger.debug("Path is now %s" % path) + + elif path.startswith("https://"): + + path = path[len("https://") :] + self.logger.debug("Path is now %s" % path) + + else: + error_msg = "It appears the URL is not valid. %s" % path + + self.logger.error(error_msg) + raise Exception(error_msg) + + bucket_name = path.split(".")[0] + else: + error_msg = "It appears the URL is not a valid s3 path. %s" % path + + self.logger.error(error_msg) + raise Exception(error_msg) + + return bucket_name + + def determine_file_key(self, path): + """ + Determine the key of the s3 file based on the filepath provided. + :param path: Full s3 filepath. Can be in s3:// or http(s):// format. + :type path: str + :return: + """ + + if "s3://" in path: + groups = path.split("/", 3) + + key = groups[3] + + elif "s3" in path and ".amazonaws.com" in path: + groups = path.split(".amazonaws.com/") + + key = groups[1] + + else: + error_msg = "It appears the URL is not valid. %s" % path + + self.logger.error(error_msg) + raise Exception(error_msg) + + return key + + def determine_s3_file_exists(self, path): + """ + Determine if a file exists on s3 based on given path. + :param path: Full s3 filepath. Can be in s3:// or http(s):// format. + :type path: str + :return: + """ + self.logger.info("Determining if %s exists." % path) + + bucket_name = self.determine_bucket_name(path=path) + + key = self.determine_file_key(path=path) + + try: + self.s3.Object(bucket_name, key).load() + + return True + + except ClientError: + error_msg = "File %s does not exist in s3." % path + self.logger.error(error_msg) + + return False + + def determine_valid_s3_path(self, path): + """ + Take the provided path and determine if valid s3 URL. + :param path: Full s3 filepath. Can be in s3:// or http(s):// format. + :type path: str + :return: + """ + self.logger.debug("Validating %s" % path) + if "s3://" in path: + self.logger.debug("s3:// in path.") + return True + elif "s3" in path and ".amazonaws.com" in path: + self.logger.debug("s3 and .amazonaws.com in path") + return True + else: + self.logger.error("Path is invalid.") + return False + + def get_s3_bucket(self, bucket_name): + + return self.s3.Bucket(bucket_name) + + def read_from_s3(self, bucket_name, filename): + """ + With a given bucket and filename, read from s3. + :param bucket: + :param file: + :return: + """ + + bucket = self.get_s3_bucket(bucket_name=bucket_name) diff --git a/process_tracker/utilities/settings.py b/process_tracker/utilities/settings.py index b441415..d61cc74 100644 --- a/process_tracker/utilities/settings.py +++ b/process_tracker/utilities/settings.py @@ -1,8 +1,12 @@ # Settings manager and configuration, both for initialization and reading. import configparser +import logging import os from pathlib import Path +import tempfile + +from process_tracker.utilities.aws_utilities import AwsUtilities class SettingsManager: @@ -15,24 +19,57 @@ def __init__(self, config_location=None): self.config = configparser.ConfigParser(allow_no_value=True) + self.logger = logging.getLogger(__name__) + self.logger.setLevel("DEBUG") + + self.aws_utils = AwsUtilities() + + exists = False + if config_location is None: - home = str(Path.home()) - self.config_path = os.path.join(home, ".process_tracker/") - self.config_file = os.path.join( - self.config_path, "process_tracker_config.ini" + home = Path.home() + + self.config_path = str(home.joinpath(".process_tracker/")) + self.config_file = str( + Path(self.config_path).joinpath("process_tracker_config.ini") ) + exists = os.path.isfile(self.config_file) + else: self.config_path = config_location - self.config_file = os.path.join( - self.config_path, "process_tracker_config.ini" - ) - exists = os.path.isfile(self.config_file) + if "process_tracker_config.ini" not in self.config_path: + self.logger.debug( + "process_tracker_config.ini not present. Appending to %s" + % self.config_path + ) + + self.config_file = self.config_path + + if not self.config_file.endswith("/"): + self.config_file += "/" + + self.config_file += "process_tracker_config.ini" + + self.logger.debug("Config file is now %s" % self.config_file) + else: + self.logger.debug( + "process_tracker_config.ini present. Setting config_path to config_file." + ) + self.config_file = self.config_path + + if self.aws_utils.determine_valid_s3_path( + path=self.config_path + ) and self.aws_utils.determine_s3_file_exists(path=self.config_file): + + exists = True if exists: self.read_config_file() else: + # How to handle if exists is false and it's s3? + self.create_config_file() def create_config_file(self): @@ -62,4 +99,22 @@ def read_config_file(self): :return: """ - return self.config.read(self.config_file) + if self.aws_utils.determine_valid_s3_path( + path=self.config_path + ) and self.aws_utils.determine_s3_file_exists(path=self.config_file): + + temp_file = tempfile.NamedTemporaryFile() + bucket_name = self.aws_utils.determine_bucket_name(path=self.config_path) + + bucket = self.aws_utils.get_s3_bucket(bucket_name=bucket_name) + key = self.aws_utils.determine_file_key(path=self.config_file) + + bucket.download_file(key, temp_file.name) + + with open(temp_file.name, "r") as f: + self.config.readfp(f) + temp_file.close() + + else: + + return self.config.read(self.config_file) diff --git a/tests/fixtures/process_tracker_config.ini b/tests/fixtures/process_tracker_config.ini new file mode 100644 index 0000000..2a69119 --- /dev/null +++ b/tests/fixtures/process_tracker_config.ini @@ -0,0 +1,8 @@ +[DEFAULT] +log_level = DEBUG +data_store_type = postgresql +data_store_username = pt_admin_test +data_store_password = arglebargle +data_store_host = localhost +data_store_port = 1234 +data_store_name = process_tracker \ No newline at end of file diff --git a/tests/test_process_tracker.py b/tests/test_process_tracker.py index 47d9d30..b7261d6 100755 --- a/tests/test_process_tracker.py +++ b/tests/test_process_tracker.py @@ -1,6 +1,7 @@ # Tests for validating process_tracking works as expected. from datetime import datetime, timedelta +import logging import os from pathlib import Path import time @@ -29,7 +30,7 @@ ProcessTargetObject, ProcessTracking, ) -from process_tracker.models.source import Source, SourceObject +from process_tracker.models.source import Source from process_tracker.data_store import DataStore from process_tracker.extract_tracker import ExtractTracker @@ -43,6 +44,8 @@ class TestProcessTracker(unittest.TestCase): @classmethod def setUpClass(cls): + cls.logger = logging.Logger(__name__) + cls.data_store = DataStore() cls.session = cls.data_store.session cls.data_store_type = cls.data_store.data_store_type @@ -696,6 +699,10 @@ def test_register_extracts_by_location_local(self): self.assertCountEqual(expected_result, given_result) + @unittest.skipIf( + "TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", + "Skipping this test on Travis CI.", + ) @mock_s3 def test_register_extracts_by_location_s3(self): """ @@ -740,9 +747,9 @@ def test_register_extracts_by_location_s3(self): key = os.path.join(test_bucket, file) - print(file) - print(key) - print(fixtures_dir) + self.logger.debug("Filename %s" % file) + self.logger.debug("File key %s" % key) + self.logger.debug("Fixtures dir %s" % fixtures_dir) file = os.path.join(fixtures_dir, file) client.upload_file(Filename=file, Bucket=test_bucket, Key=key) diff --git a/tests/utilities/test_aws_utilities.py b/tests/utilities/test_aws_utilities.py new file mode 100644 index 0000000..ddad7a5 --- /dev/null +++ b/tests/utilities/test_aws_utilities.py @@ -0,0 +1,243 @@ +import logging +import unittest + +import boto3 +import botocore +from moto import mock_s3 +import os + +from process_tracker.utilities.aws_utilities import AwsUtilities + + +class TestAwsUtilities(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.logger = logging.Logger(__name__) + + cls.aws_util = AwsUtilities() + + def test_determine_bucket_name_valid_path_s3(self): + """ + If path provided is an AWS CLI url, parse and return the bucket name. + :return: + """ + path = "s3://test_bucket/bucket_file.csv" + + expected_result = "test_bucket" + + given_result = self.aws_util.determine_bucket_name(path=path) + + self.assertEqual(expected_result, given_result) + + def test_determine_bucket_name_valid_path_url_http(self): + """ + If path provided is an AWS http URL, parse and return the bucket name. + :return: + """ + path = "http://test_bucket.s3.amazonaws.com/bucket_file.csv" + + expected_result = "test_bucket" + + given_result = self.aws_util.determine_bucket_name(path=path) + + self.assertEqual(expected_result, given_result) + + def test_determine_bucket_name_valid_path_url_https(self): + """ + If path provided is an AWS https URL, parse and return the bucket name + :return: + """ + path = "https://test_bucket.s3.amazonaws.com/bucket_file.csv" + + expected_result = "test_bucket" + + given_result = self.aws_util.determine_bucket_name(path=path) + + self.assertEqual(expected_result, given_result) + + def test_determine_bucket_name_invalid_path(self): + """ + If path provided is an invalid s3 path, throw exception. + :return: + """ + path = "invalid.path/test_bucket/bucket_file.csv" + + with self.assertRaises(Exception) as context: + + self.aws_util.determine_bucket_name(path=path) + + return self.assertTrue( + "It appears the URL is not a valid s3 path. invalid.path/test_bucket/bucket_file.csv" + in str(context.exception) + ) + + def test_determine_file_key_valid_path_s3(self): + """ + If path provided is an AWS CLI url, parse and return the object key. + :return: + """ + path = "s3://test_bucket/folder/bucket_file.csv" + + expected_result = "folder/bucket_file.csv" + + given_result = self.aws_util.determine_file_key(path=path) + + self.assertEqual(expected_result, given_result) + + def test_determine_file_key_valid_path_http(self): + """ + If path provided is an AWS http URL, parse and return the object key. + :return: + """ + + path = "http://test_bucket.s3.amazonaws.com/bucket_file.csv" + + expected_result = "bucket_file.csv" + + given_result = self.aws_util.determine_file_key(path=path) + + self.assertEqual(expected_result, given_result) + + def test_determine_file_key_valid_path_https(self): + """ + If path provided is an AWS https URL, parse and return the object key. + :return: + """ + + path = "https://test_bucket.s3.amazonaws.com/bucket_file.csv" + + expected_result = "bucket_file.csv" + + given_result = self.aws_util.determine_file_key(path=path) + + self.assertEqual(expected_result, given_result) + + def test_determine_file_key_invalid_path(self): + """ + If path provided is an invalid URL, throw an exception. + :return: + """ + + path = "invalid.path/test_bucket/bucket_file.csv" + + with self.assertRaises(Exception) as context: + + self.aws_util.determine_file_key(path=path) + + return self.assertTrue( + "It appears the URL is not valid. invalid.path/test_bucket/bucket_file.csv" + in str(context.exception) + ) + + @unittest.skipIf( + "TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", + "Skipping this test on Travis CI.", + ) + @mock_s3 + def test_determine_s3_file_exists_valid_file(self): + """ + If path provided is valid, determine if file exists or not. + :return: + """ + expected_keys = ["test_local_dir_1.csv"] + test_bucket = "test_bucket" + + path = "s3://test_bucket/test_local_dir_1.csv" + + client = boto3.client( + "s3", + region_name="us-east-1", + aws_access_key_id="fake_access_key", + aws_secret_access_key="fake_secret_key", + ) + + try: + s3 = boto3.resource( + "s3", + region_name="us-east-1", + aws_access_key_id="fake_access_key", + aws_secret_access_key="fake_secret_key", + ) + s3.meta.client.head_bucket(Bucket=test_bucket) + + except botocore.exceptions.ClientError: + pass + else: + err = "%s should not exist" % test_bucket + raise EnvironmentError(err) + + client.create_bucket(Bucket=test_bucket) + + current_dir = os.path.join(os.path.dirname(__file__), "..") + fixtures_dir = os.path.join(current_dir, "fixtures") + + for file in expected_keys: + key = file + self.logger.debug("Filename %s" % file) + self.logger.debug("Fixtures dir %s" % fixtures_dir) + + file = os.path.join(fixtures_dir, file) + client.upload_file(Filename=file, Bucket=test_bucket, Key=key) + + given_result = self.aws_util.determine_s3_file_exists(path=path) + expected_result = True + + self.assertEqual(expected_result, given_result) + + @mock_s3 + def test_determine_s3_file_exists_invalid_file(self): + """ + If path provided, but file does not exist is s3, throw ClientError. + :return: + """ + + path = "s3://test_bucket/test_local_dir_1.csv" + + given_result = self.aws_util.determine_s3_file_exists(path=path) + expected_result = False + + return self.assertEqual(expected_result, given_result) + + def test_determine_valid_s3_path_valid_path_s3(self): + """ + Testing that if path is an AWS CLI URL, that the path is validated. + :return: + """ + path = "s3://test_bucket/folder/bucket_file.csv" + + given_result = self.aws_util.determine_valid_s3_path(path=path) + + self.assertTrue(given_result) + + def test_determine_valid_s3_path_valid_path_http(self): + """ + Testing that if a path is a valid http AWS S3 URL, that the path is validated. + :return: + """ + path = "http://test_bucket.s3.amazonaws.com/bucket_file.csv" + + given_result = self.aws_util.determine_valid_s3_path(path=path) + + self.assertTrue(given_result) + + def test_determine_valid_s3_path_valid_path_https(self): + """ + Testing that if a path is a valid https AWS S3 URL, that the path is validated. + :return: + """ + path = "https://test_bucket.s3.amazonaws.com/bucket_file.csv" + + given_result = self.aws_util.determine_valid_s3_path(path=path) + + self.assertTrue(given_result) + + def test_determine_valid_s3_path_invalid_path(self): + """ + Testing that if a path is not a valid URL, that the path is not validated. + :return: + """ + path = "argle.bargle/test_bucket/bucket_file.csv" + + given_result = self.aws_util.determine_valid_s3_path(path=path) + + self.assertFalse(given_result) diff --git a/tests/utilities/test_settings_manager.py b/tests/utilities/test_settings_manager.py index b63f271..941374d 100644 --- a/tests/utilities/test_settings_manager.py +++ b/tests/utilities/test_settings_manager.py @@ -1,7 +1,13 @@ import configparser +import os +from pathlib import Path import shutil import unittest +import boto3 +import botocore +from moto import mock_s3 + from process_tracker.utilities.settings import SettingsManager @@ -25,6 +31,19 @@ def test_config_location_set(self): self.assertEqual(expected_result, given_result) + def test_config_location_s3(self): + """ + Testing that if config_location is set and the path is an s3 file/location, use that instead of the home + directory. + :return: + """ + + expected_result = "s3://test_bucket/process_tracker_config.ini" + + given_result = SettingsManager(config_location="s3://test_bucket/").config_file + + self.assertEqual(expected_result, given_result) + def test_create_config_file(self): """ Testing that if the config file does not exist, it is created. @@ -37,3 +56,58 @@ def test_create_config_file(self): expected_result = "None" self.assertEqual(expected_result, given_result) + + @unittest.skipIf( + "TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", + "Skipping this test on Travis CI.", + ) + @mock_s3 + def test_read_config_file_s3(self): + """ + Testing that if config file is on s3 then the file is pulled down and read. + :return: + """ + expected_keys = ["process_tracker_config.ini"] + test_bucket = "test_bucket" + + path = "s3://test_bucket/process_tracker_config.ini" + + client = boto3.client( + "s3", + region_name="us-east-1", + aws_access_key_id="fake_access_key", + aws_secret_access_key="fake_secret_key", + ) + + try: + s3 = boto3.resource( + "s3", + region_name="us-east-1", + aws_access_key_id="fake_access_key", + aws_secret_access_key="fake_secret_key", + ) + s3.meta.client.head_bucket(Bucket=test_bucket) + + except botocore.exceptions.ClientError: + pass + else: + err = "%s should not exist" % test_bucket + raise EnvironmentError(err) + + client.create_bucket(Bucket=test_bucket) + + current_dir = os.path.join(os.path.dirname(__file__), "..") + fixtures_dir = os.path.join(current_dir, "fixtures") + + for file in expected_keys: + key = file + file = os.path.join(fixtures_dir, file) + client.upload_file(Filename=file, Bucket=test_bucket, Key=key) + + settings = SettingsManager(config_location=path).config + + given_result = settings["DEFAULT"]["data_store_username"] + + expected_result = "pt_admin_test" + + self.assertEqual(expected_result, given_result)