Commit d3f31dc3 authored by Juan Santiago Garcia Pose's avatar Juan Santiago Garcia Pose
Browse files

Merge branch 'entrega_2' into 'main'

Entrega 2

See merge request !1
parents 4a6de1a6 b6b7f48b
.idea/
*/*/__pycache__/
*/__pycache__/
Borrar/
\ No newline at end of file
......@@ -21,6 +21,9 @@ wordcloud = "*"
nltk = "*"
contractions = "*"
scipy = "*"
tensorflow = "*"
keras = "*"
xgboost = "*"
[dev-packages]
......
This diff is collapsed.
......@@ -29,7 +29,7 @@ Taller 2:
- IMDB Dataset of 50K Movie Reviews - [Kaggle](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)
### Demo
#### Demo
- Ejecutar el demo de la transformación custom
```bash
......@@ -41,7 +41,7 @@ python3 model_generation.py --demo=true
python3 model_generation.py --demo=true --demo_sentence="<p> This is </p> the best demo, don't you agree?"
```
### Evaluar modelo
#### Evaluar modelo
```bash
python3 model_generation.py
......@@ -50,4 +50,40 @@ python3 model_generation.py
- Para buscar los parámetros con RandomizedGridSearchCV
```bash
python3 model_generation.py --fit_grid_search=true
```
___
## Entrega 2
**Es necesario estar dentro de la carpeta de la entrega**
Taller 3 y 5:
- Bike Sharing Demand - [Kaggle](https://www.kaggle.com/competitions/bike-sharing-demand/overview)
#### Entrenar modelo
- Ejecutar el entrenamiento de los modelos
```bash
python3 model_generation.py --action=train
```
#### Optimizar
- Ejecutar la optimización de hiperparámetros con Optuna. Se puede especificar el número de trials con el parámetro `n_trials`.
Por defecto el valor de `n_trials` es 10.
```bash
python3 model_generation.py --action=find_model --n_trials=100
```
#### Evaluar modelo
```bash
python3 model_generation.py --action=evaluate
```
#### Generar submission
- Genera el archivo .csv con las predicciones del conjunto de test para la plataforma de Kaggle. Se almacena en la raíz
de la carpeta de la entrega.
```bash
python3 model_generation.py --action=predict
```
import argparse
import warnings
import joblib
import numpy as np
import optuna
import pandas as pd
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from utils.functions import rmsle, get_preprocess_pipeline, print_params, model_predict
# Silence library deprecation noise and fix the global RNG seed so splits
# and model fits are reproducible across runs.
warnings.filterwarnings('ignore')
np.random.seed(42)
# Parser
# CLI: --action selects the pipeline stage, --n_trials sizes the Optuna study.
parser = argparse.ArgumentParser(
    description='Entrega 2 - Taller de Aprendizaje Automático\nSantiago Garcia Pose 4.595.400-6')
parser.add_argument("--action", help="Action to perform from ['evaluate', 'train', 'find_model'].", default="evaluate")
parser.add_argument("--n_trials", help="Number of trials for Optuna study. Default is 10.", default=10)
args = parser.parse_args()
# Load and split data
# Kaggle "Bike Sharing Demand" CSVs are expected in the current directory.
df_train_full = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
y_train_full = df_train_full['count']
# Hold out a validation split (sklearn default 25%) for tuning/evaluation.
df_train, df_valid, y_train, y_valid = train_test_split(df_train_full, y_train_full, random_state=42)
# Reset indexes so later positional pandas operations line up cleanly.
df_train.reset_index(drop=True, inplace=True)
df_valid.reset_index(drop=True, inplace=True)
# all data
# In this dataset 'count' = 'casual' + 'registered'; a separate model is
# trained for each of the two sub-targets and their predictions are summed.
y_casual_full = df_train_full["casual"].values.copy()
y_registered_full = df_train_full["registered"].values.copy()
# train data
y_casual = df_train["casual"].values.copy()
y_registered = df_train["registered"].values.copy()
# validation data
y_valid_casual = df_valid["casual"].values.copy()
y_valid_registered = df_valid["registered"].values.copy()
# preprocessing pipeline
preprocess = get_preprocess_pipeline()
# NOTE: greater_is_better=False makes the scorer return -RMSLE.
root_mean_square_log_error = make_scorer(rmsle, greater_is_better=False)
def objective(trial, casual=True):
    """Optuna objective: fit an XGBoost pipeline and return validation RMSLE.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample hyperparameters.
    casual : bool, default=True
        If True, train/evaluate on the "casual" target, otherwise on the
        "registered" target.

    Returns
    -------
    float
        RMSLE on the validation split (lower is better).
    """
    # Hyperparameter search space.
    n_estimators = trial.suggest_int("n_estimators", 50, 500, 50)
    max_depth = trial.suggest_int("max_depth", 4, 20)
    min_split_loss = trial.suggest_float("min_split_loss", 0.0001, 10, log=True)
    learning_rate = trial.suggest_float("learning_rate", 0, 1)
    min_child_weight = trial.suggest_float("min_child_weight", 0, 10)
    reg_lambda = trial.suggest_float("reg_lambda", 1, 5)
    xgb_regressor = XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_split_loss=min_split_loss,
        learning_rate=learning_rate,
        min_child_weight=min_child_weight,
        reg_lambda=reg_lambda,
        random_state=42
    )
    model_pipeline = Pipeline([
        ("ct", preprocess),
        ("reg", xgb_regressor)
    ])
    # BUG FIX: the exact inverse of np.log1p is np.expm1, not np.exp
    # (np.exp leaves back-transformed predictions shifted by +1).
    tt = TransformedTargetRegressor(regressor=model_pipeline, func=np.log1p, inverse_func=np.expm1)
    # BUG FIX: train on the same sub-target that is evaluated below.
    # The original fit on y_train (total count) while scoring against the
    # casual/registered validation targets.
    tt.fit(df_train, y_casual if casual else y_registered)
    # evaluate model: the scorer returns -RMSLE, so negate to minimize RMSLE.
    score = -root_mean_square_log_error(tt, df_valid, y_valid_casual if casual else y_valid_registered)
    return score
def train_model(df, y_casual_, y_registered_):
    """Fit one tuned XGBoost pipeline per user segment.

    Parameters
    ----------
    df : pandas.DataFrame
        Training features.
    y_casual_ : array-like
        Target values for casual-user demand.
    y_registered_ : array-like
        Target values for registered-user demand.

    Returns
    -------
    tuple
        The fitted ``(casual, registered)`` TransformedTargetRegressor models.
    """
    # Hyperparameters found by the Optuna study (see `objective`).
    xgb_regressor_casual = XGBRegressor(
        n_estimators=500,
        max_depth=6,
        min_split_loss=0.0012023618078962767,
        learning_rate=0.06388160311484559,
        min_child_weight=1.495235851370954,
        reg_lambda=2.0408407687224943,
        random_state=42
    )
    xgb_regressor_registered = XGBRegressor(
        n_estimators=450,
        max_depth=20,
        min_split_loss=0.026801189550887592,
        learning_rate=0.03098427702316453,
        min_child_weight=3.249186054919565,
        reg_lambda=4.6163867613413085,
        random_state=42
    )
    model_pipeline_casual = Pipeline([
        ("ct", preprocess),
        ("reg", xgb_regressor_casual)
    ])
    model_pipeline_registered = Pipeline([
        ("ct", preprocess),
        ("reg", xgb_regressor_registered)
    ])
    # BUG FIX: np.expm1 is the exact inverse of np.log1p (np.exp shifted the
    # back-transformed predictions by +1).
    tt_casual_ = TransformedTargetRegressor(model_pipeline_casual, func=np.log1p, inverse_func=np.expm1)
    tt_registered_ = TransformedTargetRegressor(model_pipeline_registered, func=np.log1p, inverse_func=np.expm1)
    print("\nFitting models ...")
    tt_casual_.fit(df, y_casual_)
    tt_registered_.fit(df, y_registered_)
    return tt_casual_, tt_registered_
def evaluate_model(model_casual, model_registered):
    """Print train/validation RMSLE for the summed casual+registered model.

    Uses the module-level train/validation splits; output goes to stdout.
    """
    splits = [
        ("X_train", df_train, y_train),
        ("X_valid", df_valid, y_valid),
    ]
    print("=" * 10 + " MODEL EVALUATION " + "=" * 10)
    print()
    for label, features, target in splits:
        predicted = model_predict(model_casual, model_registered, features)
        print(f"{label} RMSLE:", rmsle(target, predicted))
    print()
    print("=" * 40)
if __name__ == "__main__":
    # Entry point: dispatch on --action ('train' | 'find_model' | 'evaluate' | 'predict').
    print("*" * 50)
    print("Entrega 2 - Taller de Aprendizaje Automático\nSantiago Garcia Pose 4.595.400-6")
    print("*" * 20)
    print()
    action = args.action.lower()
    n_trials = int(args.n_trials)
    if action == "train":
        # Fit both segment models on the full training set with the tuned
        # hyperparameters, persist them, then report train/valid RMSLE.
        print("Training model with best params ...")
        print("Casual model --")
        print_params(casual=True)
        print()
        print("Registered model --")
        print_params(casual=False)
        print()
        tt_casual, tt_registered = train_model(df_train_full, y_casual_full, y_registered_full)
        print("Saving models in ./models folder ...")
        # NOTE(review): assumes ./models already exists — joblib.dump does
        # not create directories.
        joblib.dump(tt_casual, "./models/tt_casual.gz", compress=9)
        joblib.dump(tt_registered, "./models/tt_registered.gz", compress=9)
        evaluate_model(tt_casual, tt_registered)
    elif action == "find_model":
        # Run one Optuna study per target (casual / registered) and print
        # the best trial of each.
        casual = lambda trial: objective(trial, casual=True)
        registered = lambda trial: objective(trial, casual=False)
        print(f"Starting optuna study for casual users with {n_trials} trials...")
        study_casual = optuna.create_study(direction="minimize", study_name="casual")
        study_casual.optimize(casual, n_trials=n_trials, timeout=None)
        study_registered = optuna.create_study(direction="minimize", study_name="registered")
        study_registered.optimize(registered, n_trials=n_trials, timeout=None)
        for study in [study_casual, study_registered]:
            print("Number of finished trials: {}".format(len(study.trials)))
            print(f"Best {study.study_name} trial:")
            trial = study.best_trial
            print("    Value: {}".format(trial.value))
            print("    Params: ")
            for key, value in trial.params.items():
                print("        {}: {}".format(key, value))
            print()
    elif action == "evaluate":
        # Load the persisted models and report train/valid RMSLE.
        print("Evaluating model with params: ")
        tt_casual = joblib.load("models/tt_casual.gz")
        tt_registered = joblib.load("models/tt_registered.gz")
        evaluate_model(tt_casual, tt_registered)
    elif action == "predict":
        # Load the persisted models, predict the test set, and write the
        # Kaggle submission CSV to the current directory.
        tt_casual = joblib.load("./models/tt_casual.gz")
        tt_registered = joblib.load("./models/tt_registered.gz")
        print("Making predictions for test set ...")
        y_test_pred = model_predict(tt_casual, tt_registered, df_test)
        print("Predictions:", y_test_pred)
        print("\nSaving predictions in csv format ...")
        submission = pd.DataFrame.from_dict({'datetime': df_test['datetime'], 'count': y_test_pred})
        submission.to_csv("entrega_submission_xgb.csv", index=False)
        print("Done.")
This diff is collapsed.
This diff is collapsed.
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
class ProcessDatetimeData(BaseEstimator, TransformerMixin):
    """Impute numeric columns and expand the raw ``datetime`` column into
    calendar and interaction features for the bike-sharing dataset."""

    def __init__(self):
        # Median imputation for every column except 'datetime', which is
        # dropped before the imputer sees the data.
        self.imputer = SimpleImputer(strategy="median")

    def fit(self, X, y=None):
        """Fit the imputer on all columns except ``datetime``."""
        X_ = X.drop(columns="datetime", errors="ignore")
        self.imputer.fit(X_)
        return self

    def transform(self, X, y=None):
        """Return an imputed frame with engineered datetime features.

        Leakage columns ('count', 'casual', 'registered') and the raw
        'datetime' column are removed from the output.
        """
        X_ = X.drop(columns="datetime", errors="ignore")
        columns = X_.columns
        X_ = self.imputer.transform(X_)
        X_ = pd.DataFrame(data=X_, columns=columns)
        # BUG FIX: assign positionally via .to_numpy(). The imputed frame has
        # a fresh RangeIndex, so Series assignment would align on X's original
        # index and introduce NaT rows whenever X has a non-default index.
        X_['datetime'] = pd.to_datetime(X['datetime']).to_numpy()
        X_["year"] = X_["datetime"].dt.year
        X_["month"] = X_["datetime"].dt.month
        X_["week"] = X_["datetime"].dt.isocalendar().week
        X_["week"] = X_["week"].astype(int)
        X_['hour'] = X_['datetime'].dt.hour
        X_['weekday'] = X_['datetime'].dt.weekday
        # Interaction features; the +1 offsets guard against division by zero.
        X_['temp/humidity'] = X_['temp'] / (X_['humidity'] + 1) * (X_['hour'] + 1)
        X_['temp*windspeed'] = X_['temp'] * X_['windspeed'] * (X_['hour'] + 1)
        X_['humidity/windspeed'] = X_['humidity'] / (X_['windspeed'] + 1)
        for col in ['count', 'casual', 'registered', 'datetime']:
            if col in list(X_):
                X_.drop(columns=col, inplace=True)
        return X_
\ No newline at end of file
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_log_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from utils.ProcessDatetimeData import ProcessDatetimeData
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error.

    Negative values in either argument are clipped to zero before the log
    transform, matching sklearn's mean_squared_log_error domain requirement.
    """
    true_arr = np.clip(np.asarray(y_true), 0, None)
    pred_arr = np.clip(np.asarray(y_pred), 0, None)
    # MSLE is the mean squared difference of the log1p-transformed values.
    log_residuals = np.log1p(true_arr) - np.log1p(pred_arr)
    return np.sqrt(np.mean(np.square(log_residuals)))
def print_params(casual=True):
    """Print the tuned XGBoost hyperparameters for the selected segment.

    Prints the casual-model parameters when `casual` is truthy, otherwise
    the registered-model parameters. Returns None (value of print()).
    """
    registered_params = """Params:
n_estimators: 450
max_depth: 20
min_split_loss: 0.026801189550887592
learning_rate: 0.03098427702316453
min_child_weight: 3.249186054919565
reg_lambda: 4.6163867613413085"""
    casual_params = """Params:
n_estimators: 500
max_depth: 6
min_split_loss: 0.0012023618078962767
learning_rate: 0.06388160311484559
min_child_weight: 1.495235851370954
reg_lambda: 2.0408407687224943"""
    selected = casual_params if casual else registered_params
    return print(selected)
def get_preprocess_pipeline():
    """Build the ColumnTransformer shared by every model pipeline.

    'season' is imputed (most frequent) and one-hot encoded; the remaining
    columns — including the raw 'datetime' — go through the custom
    ProcessDatetimeData feature builder.
    """
    feature_columns = [
        'datetime', 'holiday', 'workingday', 'weather',
        'temp', 'atemp', 'humidity', 'windspeed',
    ]
    season_encoder = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ])
    datetime_features = Pipeline([
        ("features", ProcessDatetimeData()),
    ])
    return ColumnTransformer([
        ("cat", season_encoder, ["season"]),
        ("num", datetime_features, feature_columns),
    ])
def model_predict(model_casual, model_registered, df):
    """Predict total demand as the sum of the two segment models' outputs."""
    return model_casual.predict(df) + model_registered.predict(df)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment