# AWS Utilities
# Utilities for working with AWS services

import logging


class AwsUtilities:
    """
    Helpers for recognising and parsing s3 locations and for talking to s3 through boto3.

    Two location formats are recognised:
    ``s3://<bucket>/<key>`` and ``http(s)://<bucket>.s3.amazonaws.com/<key>``.
    """

    _S3_SCHEME = "s3://"
    _HTTP_SCHEMES = ("http://", "https://")
    _AWS_HOST_SEPARATOR = ".amazonaws.com/"

    def __init__(self):
        """
        Set up the logger (level forced to DEBUG) and create the boto3 s3 resource.
        """
        # Imported here rather than at module level so the pure URL-parsing helpers can be
        # imported and exercised on machines that do not have the AWS SDK installed.
        import boto3

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel("DEBUG")

        self.s3 = boto3.resource("s3")

    def determine_bucket_name(self, path):
        """
        For the given path, return the bucket name, if path is a valid s3 URL.
        :param path: Full s3 filepath. Can be in s3:// or http(s):// format.
        :type path: str
        :return: The bucket name parsed out of the path.
        :raises Exception: If determine_valid_s3_path rejects the path, or if an amazonaws.com
                           URL does not start with http:// or https://.
        """
        if not self.determine_valid_s3_path(path=path):
            error_msg = "It appears the URL is not a valid s3 path. %s" % path

            self.logger.error(error_msg)
            raise Exception(error_msg)

        self.logger.debug("Parsing %s", path)

        if self._S3_SCHEME in path:
            # Everything after the scheme, wherever the scheme sits in the string.  The
            # previous boolean-indexed slice only stripped the scheme when it was at index 0
            # and otherwise returned the text before the first "/" as the bucket.
            remainder = path.partition(self._S3_SCHEME)[2]

            self.logger.debug("Path is now %s", remainder)

            bucket_name = remainder.split("/")[0]

            self.logger.debug("Bucket name is %s", bucket_name)

            return bucket_name

        # The path passed validation without containing s3://, so it is the amazonaws.com form.
        for scheme in self._HTTP_SCHEMES:
            if path.startswith(scheme):
                remainder = path[len(scheme) :]

                self.logger.debug("Path is now %s", remainder)

                # NOTE(review): the bucket is taken as the text before the first ".", so bucket
                # names containing dots and URLs whose host starts with "s3." are not parsed
                # correctly here -- confirm whether those forms need to be supported.
                return remainder.split(".")[0]

        error_msg = "It appears the URL is not valid. %s" % path

        self.logger.error(error_msg)
        raise Exception(error_msg)

    def determine_file_key(self, path):
        """
        Determine the key of the s3 file based on the filepath provided.
        :param path: Full s3 filepath. Can be in s3:// or http(s):// format.
        :type path: str
        :return: The object key, i.e. everything after the bucket (s3://) or after
                 ".amazonaws.com/" (http(s)://).
        :raises Exception: If the path is in neither format, or if it names a bucket/host but
                           has no object part after it.
        """
        found = False
        key = None

        if self._S3_SCHEME in path:
            remainder = path.partition(self._S3_SCHEME)[2]
            # An empty separator means there is no "/" after the bucket, so there is no key.
            _, separator, key = remainder.partition("/")
            found = bool(separator)

        elif "s3" in path and ".amazonaws.com" in path:
            # Split on the first occurrence only, so a key that itself contains
            # ".amazonaws.com/" is returned whole.
            _, separator, key = path.partition(self._AWS_HOST_SEPARATOR)
            found = bool(separator)

        if not found:
            error_msg = "It appears the URL is not valid. %s" % path

            self.logger.error(error_msg)
            raise Exception(error_msg)

        return key

    def determine_s3_file_exists(self, path):
        """
        Determine if a file exists on s3 based on given path.
        :param path: Full s3 filepath. Can be in s3:// or http(s):// format.
        :type path: str
        :return: True if loading the object's metadata succeeds, False if it raises ClientError.
        :raises Exception: Propagated from determine_bucket_name / determine_file_key when the
                           path cannot be parsed.
        """
        # botocore.exceptions is the documented home of ClientError.
        from botocore.exceptions import ClientError

        self.logger.info("Determining if %s exists.", path)

        bucket_name = self.determine_bucket_name(path=path)
        key = self.determine_file_key(path=path)

        try:
            self.s3.Object(bucket_name, key).load()

        except ClientError:
            # NOTE(review): every ClientError from load() is reported as "does not exist";
            # presumably that also covers permission or missing-bucket errors -- confirm
            # whether callers need to tell those cases apart.
            self.logger.error("File %s does not exist in s3.", path)

            return False

        return True

    def determine_valid_s3_path(self, path):
        """
        Take the provided path and determine if valid s3 URL.
        :param path: Full s3 filepath. Can be in s3:// or http(s):// format.
        :type path: str
        :return: True if the path contains "s3://", or contains both "s3" and ".amazonaws.com";
                 False otherwise. These are substring checks only.
        """
        self.logger.debug("Validating %s", path)

        if self._S3_SCHEME in path:
            self.logger.debug("s3:// in path.")
            return True

        if "s3" in path and ".amazonaws.com" in path:
            self.logger.debug("s3 and .amazonaws.com in path")
            return True

        self.logger.error("Path is invalid.")
        return False

    def get_s3_bucket(self, bucket_name):
        """
        Return the boto3 Bucket resource for the given bucket name.
        :param bucket_name: Name of the s3 bucket.
        :type bucket_name: str
        :return: The value of self.s3.Bucket(bucket_name).
        """
        return self.s3.Bucket(bucket_name)

    def read_from_s3(self, bucket_name, filename):
        """
        With a given bucket and filename, read from s3.
        :param bucket_name: Name of the s3 bucket.
        :param filename: Name of the file within the bucket. Currently unused.
        :return: None.
        """
        # NOTE(review): only the bucket handle is resolved; nothing is read or returned yet.
        # This looks unfinished -- behaviour is deliberately left as it was.
        self.get_s3_bucket(bucket_name=bucket_name)
import configparser +import logging import os from pathlib import Path +import tempfile + +from process_tracker.utilities.aws_utilities import AwsUtilities class SettingsManager: @@ -15,24 +19,57 @@ def __init__(self, config_location=None): self.config = configparser.ConfigParser(allow_no_value=True) + self.logger = logging.getLogger(__name__) + self.logger.setLevel("DEBUG") + + self.aws_utils = AwsUtilities() + + exists = False + if config_location is None: - home = str(Path.home()) - self.config_path = os.path.join(home, ".process_tracker/") - self.config_file = os.path.join( - self.config_path, "process_tracker_config.ini" + home = Path.home() + + self.config_path = str(home.joinpath(".process_tracker/")) + self.config_file = str( + Path(self.config_path).joinpath("process_tracker_config.ini") ) + exists = os.path.isfile(self.config_file) + else: self.config_path = config_location - self.config_file = os.path.join( - self.config_path, "process_tracker_config.ini" - ) - exists = os.path.isfile(self.config_file) + if "process_tracker_config.ini" not in self.config_path: + self.logger.debug( + "process_tracker_config.ini not present. Appending to %s" + % self.config_path + ) + + self.config_file = self.config_path + + if not self.config_file.endswith("/"): + self.config_file += "/" + + self.config_file += "process_tracker_config.ini" + + self.logger.debug("Config file is now %s" % self.config_file) + else: + self.logger.debug( + "process_tracker_config.ini present. Setting config_path to config_file." + ) + self.config_file = self.config_path + + if self.aws_utils.determine_valid_s3_path( + path=self.config_path + ) and self.aws_utils.determine_s3_file_exists(path=self.config_file): + + exists = True if exists: self.read_config_file() else: + # How to handle if exists is false and it's s3? 
+ self.create_config_file() def create_config_file(self): @@ -62,4 +99,22 @@ def read_config_file(self): :return: """ - return self.config.read(self.config_file) + if self.aws_utils.determine_valid_s3_path( + path=self.config_path + ) and self.aws_utils.determine_s3_file_exists(path=self.config_file): + + temp_file = tempfile.NamedTemporaryFile() + bucket_name = self.aws_utils.determine_bucket_name(path=self.config_path) + + bucket = self.aws_utils.get_s3_bucket(bucket_name=bucket_name) + key = self.aws_utils.determine_file_key(path=self.config_file) + + bucket.download_file(key, temp_file.name) + + with open(temp_file.name, "r") as f: + self.config.readfp(f) + temp_file.close() + + else: + + return self.config.read(self.config_file) diff --git a/tests/fixtures/process_tracker_config.ini b/tests/fixtures/process_tracker_config.ini new file mode 100644 index 0000000..2a69119 --- /dev/null +++ b/tests/fixtures/process_tracker_config.ini @@ -0,0 +1,8 @@ +[DEFAULT] +log_level = DEBUG +data_store_type = postgresql +data_store_username = pt_admin_test +data_store_password = arglebargle +data_store_host = localhost +data_store_port = 1234 +data_store_name = process_tracker \ No newline at end of file diff --git a/tests/test_process_tracker.py b/tests/test_process_tracker.py index 47d9d30..b7261d6 100755 --- a/tests/test_process_tracker.py +++ b/tests/test_process_tracker.py @@ -1,6 +1,7 @@ # Tests for validating process_tracking works as expected. 
from datetime import datetime, timedelta +import logging import os from pathlib import Path import time @@ -29,7 +30,7 @@ ProcessTargetObject, ProcessTracking, ) -from process_tracker.models.source import Source, SourceObject +from process_tracker.models.source import Source from process_tracker.data_store import DataStore from process_tracker.extract_tracker import ExtractTracker @@ -43,6 +44,8 @@ class TestProcessTracker(unittest.TestCase): @classmethod def setUpClass(cls): + cls.logger = logging.Logger(__name__) + cls.data_store = DataStore() cls.session = cls.data_store.session cls.data_store_type = cls.data_store.data_store_type @@ -696,6 +699,10 @@ def test_register_extracts_by_location_local(self): self.assertCountEqual(expected_result, given_result) + @unittest.skipIf( + "TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", + "Skipping this test on Travis CI.", + ) @mock_s3 def test_register_extracts_by_location_s3(self): """ @@ -740,9 +747,9 @@ def test_register_extracts_by_location_s3(self): key = os.path.join(test_bucket, file) - print(file) - print(key) - print(fixtures_dir) + self.logger.debug("Filename %s" % file) + self.logger.debug("File key %s" % key) + self.logger.debug("Fixtures dir %s" % fixtures_dir) file = os.path.join(fixtures_dir, file) client.upload_file(Filename=file, Bucket=test_bucket, Key=key) diff --git a/tests/utilities/test_aws_utilities.py b/tests/utilities/test_aws_utilities.py new file mode 100644 index 0000000..ddad7a5 --- /dev/null +++ b/tests/utilities/test_aws_utilities.py @@ -0,0 +1,243 @@ +import logging +import unittest + +import boto3 +import botocore +from moto import mock_s3 +import os + +from process_tracker.utilities.aws_utilities import AwsUtilities + + +class TestAwsUtilities(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.logger = logging.Logger(__name__) + + cls.aws_util = AwsUtilities() + + def test_determine_bucket_name_valid_path_s3(self): + """ + If path provided is an AWS CLI url, parse 
and return the bucket name. + :return: + """ + path = "s3://test_bucket/bucket_file.csv" + + expected_result = "test_bucket" + + given_result = self.aws_util.determine_bucket_name(path=path) + + self.assertEqual(expected_result, given_result) + + def test_determine_bucket_name_valid_path_url_http(self): + """ + If path provided is an AWS http URL, parse and return the bucket name. + :return: + """ + path = "http://test_bucket.s3.amazonaws.com/bucket_file.csv" + + expected_result = "test_bucket" + + given_result = self.aws_util.determine_bucket_name(path=path) + + self.assertEqual(expected_result, given_result) + + def test_determine_bucket_name_valid_path_url_https(self): + """ + If path provided is an AWS https URL, parse and return the bucket name + :return: + """ + path = "https://test_bucket.s3.amazonaws.com/bucket_file.csv" + + expected_result = "test_bucket" + + given_result = self.aws_util.determine_bucket_name(path=path) + + self.assertEqual(expected_result, given_result) + + def test_determine_bucket_name_invalid_path(self): + """ + If path provided is an invalid s3 path, throw exception. + :return: + """ + path = "invalid.path/test_bucket/bucket_file.csv" + + with self.assertRaises(Exception) as context: + + self.aws_util.determine_bucket_name(path=path) + + return self.assertTrue( + "It appears the URL is not a valid s3 path. invalid.path/test_bucket/bucket_file.csv" + in str(context.exception) + ) + + def test_determine_file_key_valid_path_s3(self): + """ + If path provided is an AWS CLI url, parse and return the object key. + :return: + """ + path = "s3://test_bucket/folder/bucket_file.csv" + + expected_result = "folder/bucket_file.csv" + + given_result = self.aws_util.determine_file_key(path=path) + + self.assertEqual(expected_result, given_result) + + def test_determine_file_key_valid_path_http(self): + """ + If path provided is an AWS http URL, parse and return the object key. 
+ :return: + """ + + path = "http://test_bucket.s3.amazonaws.com/bucket_file.csv" + + expected_result = "bucket_file.csv" + + given_result = self.aws_util.determine_file_key(path=path) + + self.assertEqual(expected_result, given_result) + + def test_determine_file_key_valid_path_https(self): + """ + If path provided is an AWS https URL, parse and return the object key. + :return: + """ + + path = "https://test_bucket.s3.amazonaws.com/bucket_file.csv" + + expected_result = "bucket_file.csv" + + given_result = self.aws_util.determine_file_key(path=path) + + self.assertEqual(expected_result, given_result) + + def test_determine_file_key_invalid_path(self): + """ + If path provided is an invalid URL, throw an exception. + :return: + """ + + path = "invalid.path/test_bucket/bucket_file.csv" + + with self.assertRaises(Exception) as context: + + self.aws_util.determine_file_key(path=path) + + return self.assertTrue( + "It appears the URL is not valid. invalid.path/test_bucket/bucket_file.csv" + in str(context.exception) + ) + + @unittest.skipIf( + "TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", + "Skipping this test on Travis CI.", + ) + @mock_s3 + def test_determine_s3_file_exists_valid_file(self): + """ + If path provided is valid, determine if file exists or not. 
+ :return: + """ + expected_keys = ["test_local_dir_1.csv"] + test_bucket = "test_bucket" + + path = "s3://test_bucket/test_local_dir_1.csv" + + client = boto3.client( + "s3", + region_name="us-east-1", + aws_access_key_id="fake_access_key", + aws_secret_access_key="fake_secret_key", + ) + + try: + s3 = boto3.resource( + "s3", + region_name="us-east-1", + aws_access_key_id="fake_access_key", + aws_secret_access_key="fake_secret_key", + ) + s3.meta.client.head_bucket(Bucket=test_bucket) + + except botocore.exceptions.ClientError: + pass + else: + err = "%s should not exist" % test_bucket + raise EnvironmentError(err) + + client.create_bucket(Bucket=test_bucket) + + current_dir = os.path.join(os.path.dirname(__file__), "..") + fixtures_dir = os.path.join(current_dir, "fixtures") + + for file in expected_keys: + key = file + self.logger.debug("Filename %s" % file) + self.logger.debug("Fixtures dir %s" % fixtures_dir) + + file = os.path.join(fixtures_dir, file) + client.upload_file(Filename=file, Bucket=test_bucket, Key=key) + + given_result = self.aws_util.determine_s3_file_exists(path=path) + expected_result = True + + self.assertEqual(expected_result, given_result) + + @mock_s3 + def test_determine_s3_file_exists_invalid_file(self): + """ + If path provided, but file does not exist is s3, throw ClientError. + :return: + """ + + path = "s3://test_bucket/test_local_dir_1.csv" + + given_result = self.aws_util.determine_s3_file_exists(path=path) + expected_result = False + + return self.assertEqual(expected_result, given_result) + + def test_determine_valid_s3_path_valid_path_s3(self): + """ + Testing that if path is an AWS CLI URL, that the path is validated. 
+ :return: + """ + path = "s3://test_bucket/folder/bucket_file.csv" + + given_result = self.aws_util.determine_valid_s3_path(path=path) + + self.assertTrue(given_result) + + def test_determine_valid_s3_path_valid_path_http(self): + """ + Testing that if a path is a valid http AWS S3 URL, that the path is validated. + :return: + """ + path = "http://test_bucket.s3.amazonaws.com/bucket_file.csv" + + given_result = self.aws_util.determine_valid_s3_path(path=path) + + self.assertTrue(given_result) + + def test_determine_valid_s3_path_valid_path_https(self): + """ + Testing that if a path is a valid https AWS S3 URL, that the path is validated. + :return: + """ + path = "https://test_bucket.s3.amazonaws.com/bucket_file.csv" + + given_result = self.aws_util.determine_valid_s3_path(path=path) + + self.assertTrue(given_result) + + def test_determine_valid_s3_path_invalid_path(self): + """ + Testing that if a path is not a valid URL, that the path is not validated. + :return: + """ + path = "argle.bargle/test_bucket/bucket_file.csv" + + given_result = self.aws_util.determine_valid_s3_path(path=path) + + self.assertFalse(given_result) diff --git a/tests/utilities/test_settings_manager.py b/tests/utilities/test_settings_manager.py index b63f271..941374d 100644 --- a/tests/utilities/test_settings_manager.py +++ b/tests/utilities/test_settings_manager.py @@ -1,7 +1,13 @@ import configparser +import os +from pathlib import Path import shutil import unittest +import boto3 +import botocore +from moto import mock_s3 + from process_tracker.utilities.settings import SettingsManager @@ -25,6 +31,19 @@ def test_config_location_set(self): self.assertEqual(expected_result, given_result) + def test_config_location_s3(self): + """ + Testing that if config_location is set and the path is an s3 file/location, use that instead of the home + directory. 
+ :return: + """ + + expected_result = "s3://test_bucket/process_tracker_config.ini" + + given_result = SettingsManager(config_location="s3://test_bucket/").config_file + + self.assertEqual(expected_result, given_result) + def test_create_config_file(self): """ Testing that if the config file does not exist, it is created. @@ -37,3 +56,58 @@ def test_create_config_file(self): expected_result = "None" self.assertEqual(expected_result, given_result) + + @unittest.skipIf( + "TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", + "Skipping this test on Travis CI.", + ) + @mock_s3 + def test_read_config_file_s3(self): + """ + Testing that if config file is on s3 then the file is pulled down and read. + :return: + """ + expected_keys = ["process_tracker_config.ini"] + test_bucket = "test_bucket" + + path = "s3://test_bucket/process_tracker_config.ini" + + client = boto3.client( + "s3", + region_name="us-east-1", + aws_access_key_id="fake_access_key", + aws_secret_access_key="fake_secret_key", + ) + + try: + s3 = boto3.resource( + "s3", + region_name="us-east-1", + aws_access_key_id="fake_access_key", + aws_secret_access_key="fake_secret_key", + ) + s3.meta.client.head_bucket(Bucket=test_bucket) + + except botocore.exceptions.ClientError: + pass + else: + err = "%s should not exist" % test_bucket + raise EnvironmentError(err) + + client.create_bucket(Bucket=test_bucket) + + current_dir = os.path.join(os.path.dirname(__file__), "..") + fixtures_dir = os.path.join(current_dir, "fixtures") + + for file in expected_keys: + key = file + file = os.path.join(fixtures_dir, file) + client.upload_file(Filename=file, Bucket=test_bucket, Key=key) + + settings = SettingsManager(config_location=path).config + + given_result = settings["DEFAULT"]["data_store_username"] + + expected_result = "pt_admin_test" + + self.assertEqual(expected_result, given_result)