しょっちゅう忘れることを書いておく。
![]() |
66 |
510 views
# coding: UTF-8
import os
import numpy as np
from catboost import CatBoost, CatBoostRegressor, CatBoostClassifier
from catboost import Pool
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from util import read_csv
def create_train_data():
    """Load the training CSV and return a min-max scaled train/validation split.

    Each record returned by ``read_csv("data/train.csv")`` is split into its
    features (every column except the last) and its label (the last column).
    Ten percent of the records are held out for validation, using a fixed
    ``random_state`` so the split is reproducible.

    The scaler is fitted on the training portion only and that same fitted
    scaler is applied to the validation portion. Fitting a second scaler on
    the validation data would map the two sets with different min/max values,
    so a model trained on one would see inconsistently scaled inputs on the
    other.

    Returns:
        tuple: ``(train_dataset, eval_dataset, train_labels, eval_labels)``.
        The two datasets are the outputs of ``MinMaxScaler.transform``; the
        labels are returned exactly as produced by ``train_test_split``.
    """
    records = read_csv(r"data/train.csv")
    dataset = []
    labels = []
    for record in records:
        dataset.append(record[0:-1])
        labels.append(record[-1])

    # Split into training data and held-out validation data.
    train_dataset, eval_dataset, train_labels, eval_labels = train_test_split(
        dataset, labels, test_size=0.1, random_state=0)

    # Learn the scaling from the training data only ...
    scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
    scaler.fit(train_dataset)
    train_dataset = scaler.transform(train_dataset)

    # ... and apply the identical scaling to the validation data.
    # Validation values outside the training min/max may land outside [0, 1].
    eval_dataset = scaler.transform(eval_dataset)

    return train_dataset, eval_dataset, train_labels, eval_labels
def load_test_data(scaler=None):
    """Load the test CSV and return its min-max scaled feature matrix.

    Args:
        scaler: Optional, already fitted scaler exposing ``transform``.
            Pass the scaler that was fitted on the training data so the
            test features are scaled exactly like the training features.
            When omitted, a new ``MinMaxScaler`` with range (0, 1) is
            fitted on the test data itself, which is the previous behavior.

    Returns:
        The output of ``scaler.transform`` applied to the rows read from
        ``data/test.csv``.
    """
    test_data = read_csv(r"data/test.csv")

    if scaler is None:
        # Fallback: no training scaler supplied, so scale the test data
        # against its own min/max.
        scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
        scaler.fit(test_data)

    return scaler.transform(test_data)
def write_result_csv(file_name, y_pred):
    """Write predictions to ``results/<file_name>`` as ``index,value`` lines.

    The ``results`` directory is created when it does not exist yet. Each
    prediction is written on its own line as ``<position>,<value>`` with six
    decimal places. Values below 0 are written as 0 and values above 1 are
    written as 1.

    Args:
        file_name: Name of the file to create inside ``results``.
        y_pred: Iterable of numeric predictions.
    """
    out_dir = "results"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    out_path = os.path.join(out_dir, file_name)
    with open(out_path, "w") as out:
        for position, value in enumerate(y_pred):
            # Clamp into the closed interval [0, 1] before formatting.
            if value < 0.:
                value = 0.
            elif value > 1.:
                value = 1.
            out.write(f"{position},{value:.6f}\n")
def cat_boost_classifier(train_dataset, eval_dataset, train_labels, eval_labels, test_data):
    """Train a CatBoost binary classifier and write test probabilities to CSV.

    The model is fitted on the training data with the validation data as
    ``eval_set``. Training and validation accuracy are printed, then the
    second column of ``predict_proba`` for the test data is written to
    ``cat_boost_classifier_result.csv`` via ``write_result_csv``.

    Args:
        train_dataset: Training features.
        eval_dataset: Validation features.
        train_labels: Training labels.
        eval_labels: Validation labels.
        test_data: Features to predict for the result file.
    """
    print("======start cat_boost_classifier======")

    # Wrap the data in CatBoost pools.
    train_pool = Pool(train_dataset, label=train_labels)
    eval_pool = Pool(eval_dataset, label=eval_labels)

    params = {
        'depth': 8,
        'early_stopping_rounds': 500,
        'iterations': 1000,
        # 'custom_loss': ['Accuracy'],
        'eval_metric': 'AUC',
        'random_seed': 42,
        # 'use_best_model': True
    }
    model = CatBoostClassifier(**params)
    cab = model.fit(train_pool, eval_set=eval_pool)

    # Accuracy on both splits; each split is predicted exactly once.
    # (Use cab.predict_proba(...) instead when probabilities are needed.)
    preds_train_acc = cab.predict(train_pool)
    preds_eval_acc = cab.predict(eval_pool)
    train_acc = accuracy_score(train_labels, preds_train_acc)
    eval_acc = accuracy_score(eval_labels, preds_eval_acc)
    print(f'train_acc: {train_acc:.3f} eval_acc: {eval_acc:.3f}')

    # Build the submission: column 1 of predict_proba
    # (presumably the positive class; confirm against the label encoding).
    test_prob = cab.predict_proba(test_data)[:, 1]
    write_result_csv("cat_boost_classifier_result.csv", test_prob)
def cat_boost_regressor(train_dataset, eval_dataset, train_labels, eval_labels, test_data):
    """Fit a CatBoost regressor, print validation metrics and export predictions.

    The regressor is trained with the validation data as ``eval_set``. The
    mean squared error on the validation data and the model's ``score`` on
    both splits are printed. Predictions for ``test_data`` are written to
    ``cat_boost_regressor_result.csv`` via ``write_result_csv``.

    Args:
        train_dataset: Training features.
        eval_dataset: Validation features.
        train_labels: Training targets.
        eval_labels: Validation targets.
        test_data: Features to predict for the result file.
    """
    print("======start cat_boost_regressor======")

    # CatBoost pools for fitting and validation.
    fit_pool = Pool(train_dataset, label=train_labels)
    valid_pool = Pool(eval_dataset, label=eval_labels)

    model = CatBoostRegressor(
        early_stopping_rounds=100,
        iterations=1000,
        loss_function='MAE',
        random_seed=42,
    )
    fitted = model.fit(fit_pool, eval_set=valid_pool)

    # Validation error and per-split scores.
    eval_mse = mean_squared_error(eval_labels, fitted.predict(valid_pool))
    train_score = model.score(train_dataset, train_labels)
    eval_score = model.score(eval_dataset, eval_labels)
    print(f"MSE: {eval_mse:.3f} train_score: {train_score:.3f} eval_score:{eval_score:.3f}")

    # Export predictions for the test data.
    write_result_csv("cat_boost_regressor_result.csv", fitted.predict(test_data))
def light_gbm_classifier(train_dataset, eval_dataset, train_labels, eval_labels, test_data):
    """Fit a LightGBM binary classifier and export test probabilities.

    The classifier is trained with the validation data as ``eval_set``.
    Training and validation accuracy are printed, and the second column of
    ``predict_proba`` for ``test_data`` is written to
    ``light_gbm_classifier_result.csv`` via ``write_result_csv``.

    Args:
        train_dataset: Training features.
        eval_dataset: Validation features.
        train_labels: Training labels.
        eval_labels: Validation labels.
        test_data: Features to predict for the result file.
    """
    # LightGBM hyperparameters, passed straight to the estimator.
    classifier = LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        early_stopping_rounds=200,
        num_leaves=4,
        learning_rate=0.05,
        bagging_freq=5,
        lambda_l1=2,
        lambda_l2=2,
        n_estimators=80,
        min_data_in_leaf=20,
        seed=20111019,
        max_depth=7,
    )
    classifier.fit(
        train_dataset,
        train_labels,
        eval_set=[(eval_dataset, eval_labels)],
    )

    # Accuracy on both splits (predictions are rounded before comparison).
    train_predictions = classifier.predict(train_dataset)
    eval_predictions = classifier.predict(eval_dataset)
    train_acc = accuracy_score(train_labels, train_predictions.round(0))
    eval_acc = accuracy_score(eval_labels, eval_predictions.round(0))
    print(f'train_acc: {train_acc:.3f} eval_acc: {eval_acc:.3f}')

    # Export column 1 of predict_proba for the test data.
    positive_probability = classifier.predict_proba(test_data)[:, 1]
    write_result_csv("light_gbm_classifier_result.csv", positive_probability)
def light_gbm_regressor(train_dataset, eval_dataset, train_labels, eval_labels, test_data):
    """Fit a LightGBM regressor, print validation metrics and export predictions.

    The regressor is trained with the validation data as ``eval_set``. The
    mean squared error on the validation data and the model's ``score`` on
    both splits are printed. Predictions for ``test_data`` are written to
    ``light_gbm_regressor_result.csv`` via ``write_result_csv``.

    Args:
        train_dataset: Training features.
        eval_dataset: Validation features.
        train_labels: Training targets.
        eval_labels: Validation targets.
        test_data: Features to predict for the result file.
    """
    # LightGBM hyperparameters, passed straight to the estimator.
    regressor = LGBMRegressor(
        boosting_type='gbdt',
        objective='regression',
        metric='mse',
        num_leaves=64,
        max_depth=8,
        min_data_in_leaf=20,
        early_stopping_rounds=100,
    )
    regressor.fit(
        train_dataset,
        train_labels,
        eval_set=[(eval_dataset, eval_labels)],
    )

    # Validation error and per-split scores.
    eval_mse = mean_squared_error(eval_labels, regressor.predict(eval_dataset))
    train_score = regressor.score(train_dataset, train_labels)
    eval_score = regressor.score(eval_dataset, eval_labels)
    print(f"MSE: {eval_mse:.3f} train_score: {train_score:.3f} eval_score:{eval_score:.3f}")

    # Export predictions for the test data.
    write_result_csv("light_gbm_regressor_result.csv", regressor.predict(test_data))
def random_forest_classifier(train_dataset, eval_dataset, train_labels, eval_labels, test_data):
    """Train a random-forest binary classifier and write test probabilities to CSV.

    Training and validation accuracy and the validation AUC are printed.
    The second column of ``predict_proba`` for ``test_data`` is written to
    ``random_forest_classifier_result.csv`` via ``write_result_csv``.

    Args:
        train_dataset: Training features.
        eval_dataset: Validation features.
        train_labels: Training labels.
        eval_labels: Validation labels.
        test_data: Features to predict for the result file.
    """
    params = {
        'max_depth': 8,
        'n_estimators': 1000,
    }
    model = RandomForestClassifier(**params)
    model.fit(train_dataset, train_labels)

    # Accuracy on both splits; each split is predicted exactly once.
    preds_train_acc = model.predict(train_dataset)
    preds_eval_acc = model.predict(eval_dataset)
    train_acc = accuracy_score(train_labels, preds_train_acc)
    eval_acc = accuracy_score(eval_labels, preds_eval_acc)
    print(f'train_acc: {train_acc:.3f} eval_acc: {eval_acc:.3f}')

    # Validation AUC from column 1 of predict_proba.
    preds_eval_auc = model.predict_proba(eval_dataset)[:, 1]
    auc = roc_auc_score(eval_labels, preds_eval_auc)
    print(f"val_auc: {auc:.4f}")

    # Build the submission from column 1 of predict_proba.
    test_prob = model.predict_proba(test_data)[:, 1]
    write_result_csv("random_forest_classifier_result.csv", test_prob)
def random_forest_regressor(train_dataset, eval_dataset, train_labels, eval_labels, test_data):
    """Fit a random-forest regressor, print validation metrics and export predictions.

    The mean squared error on the validation data and the model's ``score``
    on both splits are printed. Predictions for ``test_data`` are written to
    ``random_forest_regressor_result.csv`` via ``write_result_csv``.

    Args:
        train_dataset: Training features.
        eval_dataset: Validation features.
        train_labels: Training targets.
        eval_labels: Validation targets.
        test_data: Features to predict for the result file.
    """
    forest = RandomForestRegressor(max_depth=8, n_estimators=1000)
    forest.fit(train_dataset, train_labels)

    # Validation error and per-split scores.
    eval_predictions = forest.predict(eval_dataset)
    eval_mse = mean_squared_error(eval_labels, eval_predictions)
    train_score = forest.score(train_dataset, train_labels)
    eval_score = forest.score(eval_dataset, eval_labels)
    print(f"MSE: {eval_mse:.3f} train_score: {train_score:.3f} eval_score:{eval_score:.3f}")

    # Export predictions for the test data.
    write_result_csv("random_forest_regressor_result.csv", forest.predict(test_data))
if __name__ == '__main__':
    # (train_dataset, eval_dataset, train_labels, eval_labels), already scaled.
    splits = create_train_data()
    test_data = load_test_data()

    # Uncomment the model you want to run.
    # cat_boost_classifier(*splits, test_data)
    # cat_boost_regressor(*splits, test_data)
    # light_gbm_classifier(*splits, test_data)
    # light_gbm_regressor(*splits, test_data)
    random_forest_classifier(*splits, test_data)
    # random_forest_regressor(*splits, test_data)
Page 44 of 69.
すぺぺぺ
本サイトの作成者。
プログラムは趣味と勉強を兼ねて、のんびり本サイトを作っています。
フレームワークはdjango。
ChatGPTで自動プログラム作成に取り組み中。
https://www.osumoi-stdio.com/novel/