from random import seed
#from statistics import LinearRegression
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from utils.utils import *
import numpy as np
import pandas as pd
#stock_financial('005930.KS','2018-01-01','2022-01-01')
#KRX_stock_list()
#stock_price('005930.KS','2018-01-01','2022-01-01')
import pandas_datareader.data as web
#### Load the datasets ####
#("MSFT IBM GOOGL")
#orelly_function_datareader("MSFT IBM GOOGL")
#orelly_function_datareader("DEXJPUS DEXUSUK",'fred')
#orelly_function_datareader("SP500 dJIA VIXCLS",'fred')
#### Preprocessing ####
#### index_col=[0] makes column 0 (Date) the index; header=[0,1] reads a two-level column header.
stk_data = pd.read_csv("./data_collection/datasets/orelly_MSFT IBM GOOGL.csv",index_col=[0],header=[0,1])
#stk_data_header_0 = stk_data.columns.get_level_values(level=0)
#stk_data_header_1 = stk_data.columns.get_level_values(level=1)
#stk_data_header_2 = stk_data.columns.get_level_values(level=2).str.replace('Un.*','',regex=True)
#stk_data.columns = [stk_data_header_0,stk_data_header_1,stk_data_header_2]
#print(stk_data_header_0)
#print(stk_data_header_1)
#print(stk_data_header_2)
#print(stk_data.columns)
#print(stk_data.index)
#print(stk_data)
###############################
## Attributes: 'Adj Close' is the adjusted closing price
ccy_data = pd.read_csv("./data_collection/datasets/orelly_DEXJPUS DEXUSUK.csv",index_col=[0])
idx_data = pd.read_csv("./data_collection/datasets/orelly_SP500 dJIA VIXCLS.csv",index_col=[0])
'''
stk_tickers = ['MSFT','IBM','GOOGL']
ccy_tickers = ['DEXJPUS','DEXUSUK']
idx_tickers = ['SP500','DJIA','VIXCLS']
stk_data = web.DataReader(stk_tickers,'yahoo')
ccy_data = web.DataReader(ccy_tickers, 'fred')
idx_data = web.DataReader(idx_tickers,'fred')
'''
return_period = 5
# diff computes differences (its argument is the lag: 5 days, then 10, 15, ...)
## For every row, take only the 'Adj Close' columns
# Y: take the log of the ('Adj Close', 'MSFT') column -> diff(5) gives the 5-day difference -> shift(-5) moves it up 5 rows so each row holds its future return
Y = np.log(stk_data.loc[:,('Adj Close','MSFT')]).diff(return_period).shift(-return_period)
#print(Y.index.name) # Date
#print(Y.name) # (Adj Close,MSFT)
Y.name = Y.name[-1] + '_pred'
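# A minimal sketch (toy prices, an assumption for illustration) of what .diff(k).shift(-k) does:
# diff(k) is the k-row trailing log difference, and shift(-k) moves it up so each row holds its *future* k-day return.
# demo = pd.Series(np.log([100., 102., 101., 105., 107., 110.]))
# print(demo.diff(2))            # log(p_t) - log(p_(t-2)), trailing return
# print(demo.diff(2).shift(-2))  # log(p_(t+2)) - log(p_t), the prediction target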
X1 = np.log(stk_data.loc[:, ('Adj Close', ('GOOGL','IBM'))]).diff(return_period)
#print(X1)
#print(X1.columns)
#print(X1.columns.levels)
# If you don't like tuple access on MultiIndex columns, droplevel() converts them to single-level columns.
X1.columns = X1.columns.droplevel() # drops the 'Adj Close' (Attributes) level; note droplevel's default is level=0, the outermost level
#print(X1.columns)
#print(X1)
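# Hedged sketch of droplevel's default (level=0, the outermost level), with made-up columns:
# cols = pd.MultiIndex.from_tuples([('Adj Close','GOOGL'), ('Adj Close','IBM')])
# print(cols.droplevel())  # Index(['GOOGL', 'IBM'], dtype='object')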
#X2 = np.log(ccy_data.loc[:,('DEXJPUS','DEXUSUK')]).diff(return_period)
X2 = np.log(ccy_data).diff(return_period)
X3 = np.log(idx_data).diff(return_period)
#### X4: compute diff(5), diff(15), diff(30), diff(60) -> axis=1 stacks them side by side, then dropna removes rows with NaN
X4 = pd.concat([np.log(stk_data.loc[:,('Adj Close','MSFT')]).diff(i)\
for i in [return_period, return_period*3,\
return_period*6,return_period*12]],axis=1).dropna()
#print(X4)
#print(X4.columns)
X4.columns = ['MSFT_DT','MSFT_3DT','MSFT_6DT','MSFT_12DT'] # assigning a flat list turns the MultiIndex columns into single-level columns
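# Minimal sketch (toy series, for illustration only) of stacking several lagged differences with concat:
# s = pd.Series(np.arange(10, dtype=float))
# print(pd.concat([s.diff(i) for i in [1, 2, 3]], axis=1).dropna())
# # three columns; rows 0-2 are dropped because the longest lag is NaN there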
#print(X4)
#print(X1)
#print(X2)
#print(X3)
########### Resuming study from here #####
X = pd.concat([X1,X2,X3,X4],axis=1)
#print(X)
dataset = pd.concat([Y,X],axis=1).dropna().iloc[::return_period,:]
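# .iloc[::return_period, :] keeps every 5th row, so consecutive samples use non-overlapping 5-day returns.
# Toy illustration (assumption): pd.DataFrame({'v': range(12)}).iloc[::5, :] keeps rows 0, 5, 10.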
Y=dataset.loc[:,Y.name]
X=dataset.loc[:,X.columns]
#print(dataset.head())
import matplotlib.pyplot as plt
'''
correlation = dataset.corr()
plt.figure(figsize=(15,15))
plt.title('Correlation Matrix')
import seaborn as sns
sns.heatmap(correlation,vmax=1, square=True,annot=True,cmap='cubehelix')
plt.show()
'''
################################p126######################
'''
from pandas.plotting import scatter_matrix
plt.figure(figsize=(15,15))
scatter_matrix(dataset,figsize=(12,12))
plt.show()
'''
##########################################################
# Time-series analysis #
'''
import statsmodels.api as sm
res = sm.tsa.seasonal_decompose(Y,period=52) ### period instead of freq? (statsmodels renamed the argument) -- the results looked different to me
fig= res.plot()
fig.set_figheight(8)
fig.set_figwidth(15)
plt.show()
'''
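# Note on the freq/period question above: statsmodels 0.11 renamed seasonal_decompose's
# freq argument to period, so period=52 is meant to be the old freq=52. With rows sampled
# every 5 trading days, 52 rows span roughly one year, i.e. an assumed yearly seasonal cycle.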
##########################################################
# 5. Training
validation_size = 0.2
train_size = int(len(X) * (1-validation_size))
print(len(X))
X_train, X_test = X[0:train_size],X[train_size:len(X)]
Y_train, Y_test = Y[0:train_size],Y[train_size:len(Y)]
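# Note: this is a chronological split (first 80% train, last 20% test) rather than a random one,
# which avoids look-ahead leakage in a time-series setting.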
print(X,Y)
### Test options and evaluation metric (see Chapter 4)
num_folds = 10
scoring = 'neg_mean_squared_error'
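# scikit-learn scorers follow a "greater is better" convention, so the MSE is negated.
# A hedged sketch of the equivalence (fitted_model is a stand-in name):
# from sklearn.metrics import get_scorer
# scorer = get_scorer('neg_mean_squared_error')
# # scorer(fitted_model, X, y) == -mean_squared_error(y, fitted_model.predict(X))
# This is why cv_results is multiplied by -1 in the loop below.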
### Machine learning models with scikit-learn
models = []
#### Regression and regression-tree algorithms
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
models.append(('LR',LinearRegression()))
models.append(('LASSO',Lasso()))
models.append(('EN',ElasticNet()))
models.append(('KNN',KNeighborsRegressor()))
models.append(('CART',DecisionTreeRegressor()))
models.append(('SVR',SVR()))
# Neural network algorithm
from sklearn.neural_network import MLPRegressor
models.append(('MLP',MLPRegressor()))
# Ensemble models
######### Boosting methods
models.append(('ABR',AdaBoostRegressor()))
models.append(('GBR', GradientBoostingRegressor()))
## Bagging methods
models.append(('RFR',RandomForestRegressor()))
models.append(('ETR',ExtraTreesRegressor()))
#################################
from sklearn.model_selection import KFold, cross_val_score
names = []
kfold_results = []
test_results = []
train_results = []
for name, model in models:
    #print(name,model)
    names.append(name) # collect names for the k-fold comparison
    ### k-fold cross-validation ###
    # shuffle=True is required here, otherwise scikit-learn raises:
    # "Setting a random_state has no effect since shuffle is False. You should leave random_state to its default (None), or set shuffle=True."
    kfold = KFold(n_splits=num_folds, random_state=100, shuffle=True)
    #print(kfold) # KFold(n_splits=10, random_state=100, shuffle=True)
    # Convert the mean squared error (MSE) back to a positive number; lower is better
    cv_results = -1*cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    #print(cv_results) # e.g. [0.00037173 0.00141106 0.00114862 0.00121909 0.00123947 0.00228879 0.00073832 0.00092809 0.00031207 0.00081958]
    ## Errors hit along the way:
    ## "ValueError: Cannot have number of splits n_splits=10 greater than the number of samples: n_samples=0."
    ##   -> not solved directly; worked around by changing the validation size
    ## "Cannot clone object. You should provide an instance of scikit-learn estimator instead of a class."
    ##   -> caused by appending ('KNN', KNeighborsRegressor) instead of ('KNN', KNeighborsRegressor())
    kfold_results.append(cv_results)
    # Error over the full training period
    res = model.fit(X_train, Y_train)
    train_result = mean_squared_error(res.predict(X_train), Y_train) # predict with the fitted model to get in-sample predictions
    train_results.append(train_result)
    # Test error
    test_result = mean_squared_error(res.predict(X_test), Y_test)
    test_results.append(test_result)
### Cross-validation results
fig = plt.figure()
fig.suptitle('Algorithm Comparison: Kfold results')
ax = fig.add_subplot(111)
plt.boxplot(kfold_results)
ax.set_xticklabels(names)
fig.set_size_inches(15,8)
plt.show()
#### Train and test errors ####
fig = plt.figure()
ind = np.arange(len(names)) # x locations for the groups
width = 0.35 # bar width
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.bar(ind - width/2,train_results, width=width, label='Train Error')
plt.bar(ind + width/2, test_results, width=width, label='Test Error')
fig.set_size_inches(15,8)
plt.legend()
ax.set_xticks(ind)
ax.set_xticklabels(names)
plt.show()