Source code for optimalflow.autoPipe

#!/usr/bin/env python 

import pandas as pd 
from optimalflow.funcPP import PPtools
from optimalflow.autoPP import dynaPreprocessing
from optimalflow.utilis_func import data_splitting_tool,delete_old_log_files,update_progress
from optimalflow.autoFS import dynaFS_clf,dynaFS_reg
from optimalflow.autoCV import evaluate_model,dynaClassifier,dynaRegressor
import datetime
import numpy as np
from time import time
import os
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Disable warnings raised by downstream libraries while the pipeline runs.
def warn(*args, **kwargs):
    pass
warnings.warn = warn
import logging

path = os.getcwd()

# Create a ./logs folder (if needed) and build a timestamped log-file name for this run.
LOG_TS = datetime.datetime.now().strftime("%Y.%m.%d.%H.%M.%S")
logs_folder = os.path.join(os.getcwd(),'logs')
if not os.path.exists(logs_folder):
    os.makedirs(logs_folder)
log_name = os.path.join(logs_folder, f'{os.path.basename(__file__).split(".")[0]}_log_{LOG_TS}.log')

LOG_LEVEL = logging.DEBUG
DELETE_FLAG = True
TS = time()
logger = logging.getLogger(__name__)
logger.setLevel(LOG_LEVEL)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s','%d/%m %H:%M:%S')
fh = logging.FileHandler(filename = log_name)
fh.setLevel(LOG_LEVEL)
fh.setFormatter(formatter)
logger.addHandler(fh)
Test_case = f'Optimal Flow - autoPipe - Auto Pipe Connector :: {LOG_TS}'
Test_comment = '-' * len(Test_case) * 3
Start_log = '#' * len(Test_case) * 3
logger.info(Start_log)
logger.info(Test_case)
logger.info(Start_log)
delete_old_log_files(directory = logs_folder ,delete_flag = DELETE_FLAG, logger = logger, extension_list = ['.log'],filename_list = ['autoPipe_log'],log_ts = LOG_TS)
logger.info(Test_comment)


[docs]class autoPipe:
    """This class builds Pipeline Cluster Traversal Experiments.

    Parameters
    ----------
    steps : list, default = None
        List of (name, transform) tuples (implementing fit & transform) that are
        chained in the order in which they are listed, with the last object a
        model evaluation function.

    Example
    -------
    .. [Example] https://Optimal-Flow.readthedocs.io/en/latest/demos.html#build-pipeline-cluster-traveral-experiments-using-autopipe

    References
    ----------
    None
    """
    def __init__(self, steps):
        self.step1 = steps[0][1]
        self.step2 = steps[1][1]
        self.step3 = steps[2][1]
        self.step4 = steps[3][1]
        self.step5 = steps[4][1]
[docs]    def fit(self, data):
        """Fits and transforms a chain of Optimal Flow modules.

        Parameters
        ----------
        data : pandas dataframe, shape = [n_samples, n_features]
            NOTE: The input data should already have basic data cleaning and feature
            deduction applied; the more features are involved, the more column-permutation
            outputs will be generated.

        Returns
        -------
        DICT_PREP_INFO : dictionary
            Each key is the number of a preprocessed dataset ("Dataset_xxx" format, i.e. "Dataset_10");
            each value stores an info string describing the transforms applied.
            i.e. DICT_PREPROCESSING['Dataset_0'] stores the value
            "winsor_0-Scaler_None-- Encoded Features:['diagnosis', 'Size_3', 'area_mean']",
            which means the 1st winsorization mode was applied, no scaler was applied,
            and lists the encoded column names (the encoding approaches are shown in the names).
        DICT_FEATURE_SELECTION_INFO : dictionary
            Each key is the number of a preprocessed dataset; each value stores the names of the
            features selected by the autoFS module.
        DICT_MODELS_EVALUATION : dictionary
            Each key is the number of a preprocessed dataset; each value stores the model
            evaluation results on its validation dataset.
        DICT_DATA : dictionary
            Each key is the number of a preprocessed dataset. The first-level sub-key is the type
            of split set (including 'DICT_Train', 'DICT_TEST', and 'DICT_Validate'); the
            second-level sub-key is "X" for features and "y" for labels. Each value stores the
            dataset related to those keys (Pandas Dataframe format).
            i.e. DICT_DATA['Dataset_0']['DICT_TEST']["X"] is the feature set of Dataset_0's test dataset.
        models_summary : Pandas Dataframe
            Model selection ranking table across all combinations of preprocessed datasets,
            selected features, and all possible models with optimal parameters.

        NOTE - Log records are generated and saved to the ./logs folder automatically.
        """
        dyna = self.step1
        DICT_PREP_DF, DICT_PREP_INFO = dyna.fit(input_data = data)
        print(f"Total combinations: {len(DICT_PREP_DF.keys())}")
        logger.info(f"Total combinations: {len(DICT_PREP_DF.keys())}")

        # Tracking the metrics values
        DICT_MODELS_EVALUATION = {}
        # Feature Selection tracking
        DICT_FEATURE_SELECTION_INFO = {}
        DICT_DATA = {}
        loop_num = 1
        total_loop = len(DICT_PREP_DF.keys())

        for number, key in enumerate(DICT_PREP_DF.keys()):
            combination_df = DICT_PREP_DF[key]
            start_time = time()
            logger.info(Test_comment)
            dataset_num = key.split("Dataset_",1)[1]
            logger.info(f"Current Running Preprocessed Dataset No. {dataset_num}:")
            features = combination_df.drop(dyna.label_col, axis=1)
            labels = combination_df[dyna.label_col]
            logger.info("[Features Before autoFS]: ")
            logger.info(list(features.columns))

            # Split into train / validate / test sets using the rule carried by step 2.
            custom_val_size, custom_size, custom_random_state = self.step2
            X_train, y_train, X_val, y_val, X_test, y_test = data_splitting_tool(feature_cols = features, label_col = labels, val_size = custom_val_size, test_size = custom_size, random_state = custom_random_state)

            tr_features = X_train
            tr_labels = y_train

            # Feature selection (autoFS module).
            autoFS_module = self.step3
            fs_num, fs_results = autoFS_module.fit(tr_features, tr_labels)
            DICT_FEATURE_SELECTION_INFO["Dataset_" + str(dataset_num)] = fs_results
            logger.info(f"[Results Report]:")
            logger.info(f">>> autoFS summary - This dataset has the top {fs_num} important features: {fs_results}.")

            tr_features = tr_features[list(fs_results)]
            tr_labels = tr_labels
            val_features = X_val[list(fs_results)]
            val_labels = y_val
            ts_features = X_test[list(fs_results)]
            ts_labels = y_test

            DICT_PER_DATA = {"DICT_Train": {}, "DICT_Validate": {}, "DICT_TEST": {}}
            DICT_PER_DATA["DICT_Train"]["X"] = tr_features
            DICT_PER_DATA["DICT_Train"]["y"] = tr_labels
            DICT_PER_DATA["DICT_Validate"]["X"] = val_features
            DICT_PER_DATA["DICT_Validate"]["y"] = val_labels
            DICT_PER_DATA["DICT_TEST"]["X"] = ts_features
            DICT_PER_DATA["DICT_TEST"]["y"] = ts_labels
            DICT_DATA["Dataset_" + str(dataset_num)] = DICT_PER_DATA

            # Model selection with cross-validation (autoCV module).
            autoCV_module = self.step4
            cv_num, DICT_EST = autoCV_module.fit(tr_features, tr_labels)
            for est in DICT_EST.keys():
                results = DICT_EST[est]
                logger.info(f">>> autoCV summary - {est} model CrossValidation with {cv_num} folds:")
                logger.info(' - Best Parameters: {}\n'.format(results.best_params_))
                logger.info(' - Best CV Score: {}\n'.format(results.best_score_))

            # Evaluate each tuned estimator on the validation set.
            evaluate_module = self.step5
            if (evaluate_module.model_type == "cls"):
                metrics_df = pd.DataFrame(columns=['Model_Name','Accuracy','Precision','Recall','Latency','Best_Parameters'])
            if (evaluate_module.model_type == "reg"):
                metrics_df = pd.DataFrame(columns=['Model_Name','R2','MAE','MSE','RMSE','Latency','Best_Parameters'])
            for est in DICT_EST.keys():
                optimal_scores = evaluate_module.fit(name = est, model = DICT_EST[est].best_estimator_, features = val_features, labels = val_labels)
                optimal_scores.append(str([i for i in DICT_EST[est].best_params_.items()]))
                if (evaluate_module.model_type == "cls"):
                    metrics_df = metrics_df.append(pd.DataFrame([optimal_scores],columns=['Model_Name','Accuracy','Precision','Recall','Latency','Best_Parameters']))
                    logger.info('>>> {} Model Validation Results -- Accuracy: {} / Precision: {} / Recall: {} / Latency: {}s'.format(optimal_scores[0],optimal_scores[1],optimal_scores[2],optimal_scores[3],optimal_scores[4]))
                if (evaluate_module.model_type == "reg"):
                    metrics_df = metrics_df.append(pd.DataFrame([optimal_scores],columns=['Model_Name','R2','MAE','MSE','RMSE','Latency','Best_Parameters']))
                    logger.info('>>> {} Model Validation Results -- R^2 Score: {} / Mean Absolute Error: {} / Mean Squared Error: {} / Root Mean Squared Error: {} / Latency: {}s'.format(optimal_scores[0],optimal_scores[1],optimal_scores[2],optimal_scores[3],optimal_scores[4],optimal_scores[5]))
            DICT_MODELS_EVALUATION["Dataset_" + str(dataset_num)] = metrics_df

            logger.info(f"Total executed {round((time()-start_time)/60,4)} minutes")
            time_est = round(((time()-start_time)/60)*(total_loop - loop_num),4)
            update_progress(loop_num/total_loop, clear_flag = True, process_name = "autoFS & autoCV Iteration", time_est = time_est)
            loop_num += 1

        # Rank all dataset / model combinations into a single summary table.
        evaluate_module = self.step5
        dict_flow = DICT_MODELS_EVALUATION
        for key in dict_flow.keys():
            dict_flow[key]['Dataset'] = key
        if (evaluate_module.model_type == "cls"):
            models_summary = pd.concat([dict_flow[i] for i in dict_flow.keys()], ignore_index=True).sort_values(by=['Accuracy','Precision','Recall','Latency'], ascending=[False,False,False,True])
            models_summary = models_summary[["Dataset","Model_Name","Best_Parameters",'Accuracy','Precision','Recall','Latency']]
        if (evaluate_module.model_type == "reg"):
            models_summary = pd.concat([dict_flow[i] for i in dict_flow.keys()], ignore_index=True).sort_values(by=['R2','MAE','MSE','RMSE','Latency'], ascending=[False,True,True,True,True])
            models_summary = models_summary[["Dataset","Model_Name","Best_Parameters",'R2','MAE','MSE','RMSE','Latency']]

        logger.info(Start_log)
        print(f"The top 5 Models with Best Performance Metrics:")
        print(models_summary.head(5))
        logger.info(f"The top 5 Models with Best Performance Metrics:")
        logger.info(models_summary.head(5))
        return(DICT_PREP_INFO, DICT_FEATURE_SELECTION_INFO, DICT_MODELS_EVALUATION, DICT_DATA, models_summary)
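
For reference, below is a minimal usage sketch of assembling and running an autoPipe experiment, based on the classification demo linked in the class docstring. The dataset path, the label column name, and the keyword arguments passed to dynaPreprocessing, dynaFS_clf, dynaClassifier, and evaluate_model are illustrative assumptions; check your installed OptimalFlow version for the exact signatures. The plain (val_size, test_size, random_state) tuple for the splitting step matches how fit() unpacks self.step2 above.

import pandas as pd

from optimalflow.autoPipe import autoPipe
from optimalflow.autoPP import dynaPreprocessing
from optimalflow.autoFS import dynaFS_clf
from optimalflow.autoCV import dynaClassifier, evaluate_model

# Hypothetical input: a cleaned dataset with a 'diagnosis' label column.
df = pd.read_csv("./data/cleaned_dataset.csv")

pipe = autoPipe([
    # Step 1: combinatorial preprocessing (autoPP); keyword arguments are illustrative.
    ("autoPP", dynaPreprocessing(custom_parameters=None, label_col="diagnosis", model_type="cls")),
    # Step 2: (val_size, test_size, random_state) tuple, unpacked directly by fit().
    ("datasets_splitting", (0.2, 0.2, 13)),
    # Step 3: feature selection (autoFS); keyword arguments are illustrative.
    ("autoFS", dynaFS_clf(fs_num=5, random_state=13, cv=5, in_pipeline=True, input_from_file=False)),
    # Step 4: model selection with cross-validation (autoCV); keyword arguments are illustrative.
    ("autoCV", dynaClassifier(random_state=13, cv_num=5, in_pipeline=True, input_from_file=False)),
    # Step 5: model evaluation on the validation sets.
    ("model_evaluate", evaluate_model(model_type="cls", in_pipeline=True)),
])

DICT_PREP_INFO, DICT_FEATURE_SELECTION_INFO, DICT_MODELS_EVALUATION, DICT_DATA, models_summary = pipe.fit(df)

# Top-ranked pipeline/model combinations, and the test split of one preprocessed dataset.
print(models_summary.head(5))
print(DICT_DATA["Dataset_0"]["DICT_TEST"]["X"].head())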