import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, validation_curve
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
""" 1번 함수 및 파라미터 정의"""
true_alpha = 0.1
true_beta = 0.1
true_sigma0 = 0.2
risk_free_rate = 0.05 # 블랙-숄즈 옵션 가객 책정 모델의 입력인 이자율 정의
def option_vol_from_surface(moneyness, time_to_maturity):
return true_sigma0 + true_alpha * time_to_maturity + true_beta * np.square(moneyness -1)
def call_option_price(moneyness, time_to_maturity, option_vol):
d1 = (np.log(1/moneyness) + (risk_free_rate + np.square(option_vol))* time_to_maturity) / (option_vol*np.sqrt(time_to_maturity))
d2 = (np.log(1/moneyness)+ (risk_free_rate-np.square(option_vol)) * time_to_maturity)/ (option_vol*np.sqrt(time_to_maturity))
N_d1 = stats.norm.cdf(d1)
N_d2 = stats.norm.cdf(d2) # norm은 누적분포
#N_d1 = stats.uniform.cdf(d1) # uniform은 균등분포
#N_d2 = stats.uniform.cdf(d2)
return N_d1 - moneyness * np.exp(-risk_free_rate*time_to_maturity) * N_d2
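
# A quick, optional sanity check of the pricing function above (illustrative only): an
# at-the-money option should have a clearly positive price, while a deep out-of-the-money
# option should be worth close to zero.
for m, t in [(1.0, 1.0), (1.5, 0.25)]:
    vol = option_vol_from_surface(m, t)
    print(f"moneyness={m}, T={t}, vol={vol:.3f}, call price={call_option_price(m, t, vol):.4f}")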
""" 2번 데이터 생성"""
#### Price(옵션 가격) , moneyness(예측되는 변수 옵션 가격도)(행사가와 현물가의 비율), 만기일(time to maturity), volatility(변동성)
N = 10000
Ks = 1+ 0.25*np.random.randn(N)
Ts = np.random.random(N)
Sigmas = np.array([option_vol_from_surface(k,t) for k,t in zip(Ks,Ts)])
Ps = np.array([call_option_price(k,t,sig) for k,t,sig in zip(Ks,Ts,Sigmas)])
Y = Ps
X = np.concatenate([Ks.reshape(-1,1),Ts.reshape(-1,1),Sigmas.reshape(-1,1)],axis=1)
dataset = pd.DataFrame(np.concatenate([Y.reshape(-1,1),X],axis=1),columns=['Price','Moneyness','Time','Vol'])
print(dataset.head())
""" 3번 데이터 시각화"""
from pandas.plotting import scatter_matrix
scatter_matrix(dataset, figsize=(12,12))
plt.show()
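# Optional complement to the scatter matrix (not part of the original analysis):
# a numeric correlation matrix of the generated predictors and the option price.
print(dataset.corr())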
""" 데이터 준비 및 분석(단변량 특성 선택) """
bestfeatures = SelectKBest(k='all',score_func=f_regression)
fit = bestfeatures.fit(X,Y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(['Moneyness','Time','Vol'])
# Concatenate the two DataFrames for better readability
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  # name the DataFrame columns
print(featureScores.nlargest(10,'Score').set_index('Specs'))
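# Optional sketch (not in the original): visualize the univariate F-scores as a bar chart.
featureScores.set_index('Specs')['Score'].plot(kind='bar', figsize=(8,4), title='Univariate F-scores')
plt.show()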
""" 5번 모델 평가"""
validation_size = 0.2
train_size = int(len(X) * (1-validation_size))
X_train, X_test = X[0:train_size], X[train_size:len(X)]
Y_train, Y_test = Y[0:train_size], Y[train_size:len(X)]
num_folds = 10
seed = 7
scoring = 'neg_mean_squared_error'
# 5.2 Candidate regression models
models = []
models.append(('LR',LinearRegression()))
models.append(('KNN',KNeighborsRegressor()))
models.append(('CART',DecisionTreeRegressor()))
models.append(('SVR',SVR()))
models.append(('MLP',MLPRegressor()))
# Boosting methods
models.append(('ABR',AdaBoostRegressor()))
models.append(('GBR',GradientBoostingRegressor()))
# Bagging methods
models.append(('RFR',RandomForestRegressor()))
models.append(('ETR',ExtraTreesRegressor()))
#################################
names = []
kfold_results = []
test_results = []
train_results = []
for name, model in models:
    names.append(name)
    ### k-fold cross-validation ###
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)  # random_state only takes effect when shuffle=True
    # cross_val_score returns negative MSE for this scoring; flip the sign so that lower is better
    cv_results = -1*cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    # Example cv_results across the 10 folds:
    # [0.00037173 0.00141106 0.00114862 0.00121909 0.00123947 0.00228879 0.00073832 0.00092809 0.00031207 0.00081958]
    # Debugging notes:
    # - "ValueError: Cannot have number of splits n_splits=10 greater than the number of samples: n_samples=0"
    #   appeared when the train/validation split left no training samples; fixed by adjusting the validation size.
    # - "Cannot clone object. You should provide an instance of scikit-learn estimator instead of a class."
    #   appeared when the models list contained the class KNeighborsRegressor instead of the instance KNeighborsRegressor().
    kfold_results.append(cv_results)
    # Fit on the full training set; calling predict on the fitted model returns its predictions
    res = model.fit(X_train, Y_train)
    train_result = mean_squared_error(Y_train, res.predict(X_train))
    train_results.append(train_result)
    # Test-set error
    test_result = mean_squared_error(Y_test, res.predict(X_test))
    test_results.append(test_result)
### Cross-validation results ###
fig = plt.figure()
fig.suptitle('Algorithm Comparison: Kfold results')
ax = fig.add_subplot(111)
plt.boxplot(kfold_results)
ax.set_xticklabels(names)
fig.set_size_inches(15,8)
plt.show()
#### Train and test error ####
fig = plt.figure()
ind = np.arange(len(names))  # x positions of the groups
width = 0.35  # bar width
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.bar(ind - width/2,train_results, width=width, label='Train Error')
plt.bar(ind + width/2, test_results, width=width, label='Test Error')
fig.set_size_inches(15,8)
plt.legend()
ax.set_xticks(ind)
ax.set_xticklabels(names)
plt.show()
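# Optional summary table (not in the original): train and test MSE per model, built from
# the lists populated in the loop above; it makes the bar chart easier to read numerically.
comparison = pd.DataFrame({'Train MSE': train_results, 'Test MSE': test_results}, index=names)
print(comparison.sort_values('Test MSE'))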
""" 6번 모델 튜닝 및 모델 확정"""
'''
Hidden_layer_sizes : 튜플, 길이 = n_layers-2 , 기본값 (100,)
i 번째 요소는 i 번째 은닉층에 있는 뉴런의 수를 나타냄
'''
param_grid = {'hidden_layer_sizes' : [(20,), (50,),(20,20), (20,30,20)]}
model = MLPRegressor()
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring,cv=kfold)
grid_result = grid.fit(X_train,Y_train)
print("Best: %f using %s" %(grid_result.best_score_,grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
print("%f (%f) with: %r" %(mean,stdev,param))
# Prepare the tuned model
model_tuned = MLPRegressor(hidden_layer_sizes=(20,30,20))
model_tuned.fit(X_train,Y_train)
# Estimate accuracy on the held-out validation set
predictions = model_tuned.predict(X_test)
print(mean_squared_error(Y_test,predictions))
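# Optional diagnostic (not in the original): scatter of predicted vs. actual prices for the
# tuned MLP on the held-out set; points close to the diagonal indicate a good fit.
plt.figure(figsize=(6,6))
plt.scatter(Y_test, predictions, s=2, alpha=0.5)
plt.plot([Y_test.min(), Y_test.max()], [Y_test.min(), Y_test.max()], 'r--')
plt.xlabel('Actual price')
plt.ylabel('Predicted price')
plt.title('Tuned MLP: predicted vs. actual')
plt.show()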
""" 7번 추가분석 : 변됴ㅗㅇ성 데이터 제거"""
X = X[:,:2]
validation_size = 0.2
train_size = int(len(X) * (1-validation_size))
X_train, X_test = X[0:train_size], X[train_size:len(X)]
Y_train, Y_test = Y[0:train_size], Y[train_size:len(X)]
################################ Re-run the model comparison and plots without the volatility feature ################################
names = []
kfold_results = []
test_results = []
train_results = []
for name, model in models:
    names.append(name)
    ### k-fold cross-validation ###
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)  # random_state only takes effect when shuffle=True
    # cross_val_score returns negative MSE for this scoring; flip the sign so that lower is better
    cv_results = -1*cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    kfold_results.append(cv_results)
    # Fit on the full training set
    res = model.fit(X_train, Y_train)
    train_result = mean_squared_error(Y_train, res.predict(X_train))
    train_results.append(train_result)
    # Test-set error
    test_result = mean_squared_error(Y_test, res.predict(X_test))
    test_results.append(test_result)
### Cross-validation results ###
fig = plt.figure()
fig.suptitle('Algorithm Comparison: Kfold results')
ax = fig.add_subplot(111)
plt.boxplot(kfold_results)
ax.set_xticklabels(names)
fig.set_size_inches(15,8)
plt.show()
#### Train and test error ####
fig = plt.figure()
ind = np.arange(len(names))  # x positions of the groups
width = 0.35  # bar width
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.bar(ind - width/2,train_results, width=width, label='Train Error')
plt.bar(ind + width/2, test_results, width=width, label='Test Error')
fig.set_size_inches(15,8)
plt.legend()
ax.set_xticks(ind)
ax.set_xticklabels(names)
plt.show()
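# Optional (not in the original): the same train/test MSE summary for the reduced
# (Moneyness, Time) feature set, for comparison with the three-feature run above.
comparison_reduced = pd.DataFrame({'Train MSE': train_results, 'Test MSE': test_results}, index=names)
print(comparison_reduced.sort_values('Test MSE'))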