DACON XGBRegressor로 주식가격 예측하기
이 포스트의 코드는 데이콘 주식 종료 가격 예측 경진대회에 참여하여 작성했던 코드 입니다. 대회 링크
좋은 성적을 거두지는 못했지만, 짧은 시간동안 많은 노력을 들여 재미있게 참여했던 대회여서 블로그를 통해 코드를 공유합니다.
import pandas as pd
import numpy as np
import os
import FinanceDataReader as fdr
from sklearn.linear_model import LinearRegression
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
path = os.getcwd()
list_name = 'stock_list.csv'
sample_name = 'sample_submission.csv'
stock_list = pd.read_csv(os.path.join(path, list_name))
stock_list['종목코드'] = stock_list['종목코드'].apply(lambda x : str(x).zfill(6))
stock_list
종목명 | 종목코드 | 상장시장 | |
---|---|---|---|
0 | 삼성전자 | 005930 | KOSPI |
1 | SK하이닉스 | 000660 | KOSPI |
2 | NAVER | 035420 | KOSPI |
3 | 카카오 | 035720 | KOSPI |
4 | 삼성바이오로직스 | 207940 | KOSPI |
... | ... | ... | ... |
365 | 맘스터치 | 220630 | KOSDAQ |
366 | 다날 | 064260 | KOSDAQ |
367 | 제이시스메디칼 | 287410 | KOSDAQ |
368 | 크리스에프앤씨 | 110790 | KOSDAQ |
369 | 쎄트렉아이 | 099320 | KOSDAQ |
370 rows × 3 columns
start_date = '20210104'
end_date = '20211214'
Business_days = pd.DataFrame(pd.date_range(start_date, end_date, freq='B'), columns = ['Date'])
Business_days['weekday'] = Business_days.Date.apply(lambda x:x.weekday())
Business_days['weeknum'] = Business_days.Date.apply(lambda x:x.strftime('%V'))
Business_days
Date | weekday | weeknum | |
---|---|---|---|
0 | 2021-01-04 | 0 | 01 |
1 | 2021-01-05 | 1 | 01 |
2 | 2021-01-06 | 2 | 01 |
3 | 2021-01-07 | 3 | 01 |
4 | 2021-01-08 | 4 | 01 |
... | ... | ... | ... |
242 | 2021-12-08 | 2 | 49 |
243 | 2021-12-09 | 3 | 49 |
244 | 2021-12-10 | 4 | 49 |
245 | 2021-12-13 | 0 | 50 |
246 | 2021-12-14 | 1 | 50 |
247 rows × 3 columns
- KOSPI 정보 추가
KOSPI = fdr.DataReader('KS11', start_date, end_date).reset_index()
KOSPI.name = 'kospi'
def col_rename(data_set):
for i in data_set.columns:
if i =='Date':
pass
else:
data_set.rename(columns={i:data_set.name+'_'+i}, inplace=True)
return data_set
col_rename(KOSPI)
Date | kospi_Close | kospi_Open | kospi_High | kospi_Low | kospi_Volume | kospi_Change | |
---|---|---|---|---|---|---|---|
0 | 2021-01-04 | 2944.45 | 2874.50 | 2946.54 | 2869.11 | 1.030000e+09 | 0.0247 |
1 | 2021-01-05 | 2990.57 | 2943.67 | 2990.57 | 2921.84 | 1.520000e+09 | 0.0157 |
2 | 2021-01-06 | 2968.21 | 2993.34 | 3027.16 | 2961.37 | 1.790000e+09 | -0.0075 |
3 | 2021-01-07 | 3031.68 | 2980.75 | 3055.28 | 2980.75 | 1.520000e+09 | 0.0214 |
4 | 2021-01-08 | 3152.18 | 3040.11 | 3161.11 | 3040.11 | 1.300000e+09 | 0.0397 |
... | ... | ... | ... | ... | ... | ... | ... |
231 | 2021-12-08 | 3001.80 | 3017.93 | 3036.13 | 2995.34 | 4.882500e+08 | 0.0034 |
232 | 2021-12-09 | 3029.57 | 3007.00 | 3029.57 | 3001.55 | 5.134400e+08 | 0.0093 |
233 | 2021-12-10 | 3010.23 | 3008.70 | 3017.64 | 2998.29 | 4.516000e+08 | -0.0064 |
234 | 2021-12-13 | 3001.66 | 3019.67 | 3043.83 | 3000.51 | 3.758300e+08 | -0.0028 |
235 | 2021-12-14 | 2987.95 | 2983.95 | 3001.70 | 2976.16 | 5.815700e+08 | -0.0046 |
236 rows × 7 columns
data = pd.merge(Business_days, KOSPI, how = 'outer')
- KOSDAQ 정보 추가
KOSDAQ = fdr.DataReader('KQ11', start_date, end_date).reset_index()
KOSDAQ.name = 'kosdaq'
col_rename(KOSDAQ)
Date | kosdaq_Close | kosdaq_Open | kosdaq_High | kosdaq_Low | kosdaq_Volume | kosdaq_Change | |
---|---|---|---|---|---|---|---|
0 | 2021-01-04 | 977.62 | 968.86 | 977.62 | 960.52 | 1.700000e+09 | 0.0095 |
1 | 2021-01-05 | 985.76 | 976.43 | 985.76 | 965.53 | 1.810000e+09 | 0.0083 |
2 | 2021-01-06 | 981.39 | 987.25 | 990.88 | 977.37 | 1.980000e+09 | -0.0044 |
3 | 2021-01-07 | 988.86 | 983.28 | 993.91 | 982.27 | 2.260000e+09 | 0.0076 |
4 | 2021-01-08 | 987.79 | 990.70 | 995.22 | 978.12 | 2.560000e+09 | -0.0011 |
... | ... | ... | ... | ... | ... | ... | ... |
231 | 2021-12-08 | 1006.04 | 1006.61 | 1012.64 | 1003.92 | 1.070000e+09 | 0.0094 |
232 | 2021-12-09 | 1022.87 | 1009.29 | 1022.87 | 1009.29 | 1.060000e+09 | 0.0167 |
233 | 2021-12-10 | 1011.57 | 1016.34 | 1018.42 | 1010.02 | 1.150000e+09 | -0.0110 |
234 | 2021-12-13 | 1005.96 | 1014.27 | 1014.90 | 1005.96 | 1.340000e+09 | -0.0055 |
235 | 2021-12-14 | 1002.81 | 1001.11 | 1007.73 | 996.85 | 1.080000e+09 | -0.0031 |
236 rows × 7 columns
data = pd.merge(data, KOSDAQ, how = 'outer')
data
Date | weekday | weeknum | kospi_Close | kospi_Open | kospi_High | kospi_Low | kospi_Volume | kospi_Change | kosdaq_Close | kosdaq_Open | kosdaq_High | kosdaq_Low | kosdaq_Volume | kosdaq_Change | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2021-01-04 | 0 | 01 | 2944.45 | 2874.50 | 2946.54 | 2869.11 | 1.030000e+09 | 0.0247 | 977.62 | 968.86 | 977.62 | 960.52 | 1.700000e+09 | 0.0095 |
1 | 2021-01-05 | 1 | 01 | 2990.57 | 2943.67 | 2990.57 | 2921.84 | 1.520000e+09 | 0.0157 | 985.76 | 976.43 | 985.76 | 965.53 | 1.810000e+09 | 0.0083 |
2 | 2021-01-06 | 2 | 01 | 2968.21 | 2993.34 | 3027.16 | 2961.37 | 1.790000e+09 | -0.0075 | 981.39 | 987.25 | 990.88 | 977.37 | 1.980000e+09 | -0.0044 |
3 | 2021-01-07 | 3 | 01 | 3031.68 | 2980.75 | 3055.28 | 2980.75 | 1.520000e+09 | 0.0214 | 988.86 | 983.28 | 993.91 | 982.27 | 2.260000e+09 | 0.0076 |
4 | 2021-01-08 | 4 | 01 | 3152.18 | 3040.11 | 3161.11 | 3040.11 | 1.300000e+09 | 0.0397 | 987.79 | 990.70 | 995.22 | 978.12 | 2.560000e+09 | -0.0011 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
242 | 2021-12-08 | 2 | 49 | 3001.80 | 3017.93 | 3036.13 | 2995.34 | 4.882500e+08 | 0.0034 | 1006.04 | 1006.61 | 1012.64 | 1003.92 | 1.070000e+09 | 0.0094 |
243 | 2021-12-09 | 3 | 49 | 3029.57 | 3007.00 | 3029.57 | 3001.55 | 5.134400e+08 | 0.0093 | 1022.87 | 1009.29 | 1022.87 | 1009.29 | 1.060000e+09 | 0.0167 |
244 | 2021-12-10 | 4 | 49 | 3010.23 | 3008.70 | 3017.64 | 2998.29 | 4.516000e+08 | -0.0064 | 1011.57 | 1016.34 | 1018.42 | 1010.02 | 1.150000e+09 | -0.0110 |
245 | 2021-12-13 | 0 | 50 | 3001.66 | 3019.67 | 3043.83 | 3000.51 | 3.758300e+08 | -0.0028 | 1005.96 | 1014.27 | 1014.90 | 1005.96 | 1.340000e+09 | -0.0055 |
246 | 2021-12-14 | 1 | 50 | 2987.95 | 2983.95 | 3001.70 | 2976.16 | 5.815700e+08 | -0.0046 | 1002.81 | 1001.11 | 1007.73 | 996.85 | 1.080000e+09 | -0.0031 |
247 rows × 15 columns
- DOW 정보 추가
DJI = fdr.DataReader('dji', start_date, end_date).reset_index()
DJI.name = 'dji'
col_rename(DJI)
Date | dji_Close | dji_Open | dji_High | dji_Low | dji_Volume | dji_Change | |
---|---|---|---|---|---|---|---|
0 | 2021-01-04 | 30223.89 | 30627.47 | 30674.28 | 29881.82 | 476730000.0 | -0.0125 |
1 | 2021-01-05 | 30391.60 | 30204.25 | 30504.89 | 30141.78 | 350910000.0 | 0.0055 |
2 | 2021-01-06 | 30829.40 | 30362.78 | 31022.65 | 30313.07 | 500430000.0 | 0.0144 |
3 | 2021-01-07 | 31041.13 | 30901.18 | 31193.40 | 30897.86 | 430620000.0 | 0.0069 |
4 | 2021-01-08 | 31097.97 | 31069.58 | 31140.67 | 30793.27 | 385650000.0 | 0.0018 |
... | ... | ... | ... | ... | ... | ... | ... |
235 | 2021-12-08 | 35754.09 | 35716.85 | 35839.72 | 35603.95 | 363110000.0 | 0.0009 |
236 | 2021-12-09 | 35755.28 | 35722.26 | 35864.22 | 35579.91 | 335370000.0 | 0.0000 |
237 | 2021-12-10 | 35971.98 | 35830.55 | 35982.69 | 35711.90 | 344050000.0 | 0.0061 |
238 | 2021-12-13 | 35652.07 | 35906.91 | 35951.28 | 35610.42 | 419860000.0 | -0.0089 |
239 | 2021-12-14 | 35545.69 | 35605.73 | 35779.20 | 35441.74 | 414320000.0 | -0.0030 |
240 rows × 7 columns
data = pd.merge(data, DJI, how = 'outer')
- 원/달러 환율 정보 추가
EXC = fdr.DataReader('USD/KRW', start_date, end_date).reset_index()
EXC.name = 'exc'
col_rename(EXC)
Date | exc_Close | exc_Open | exc_High | exc_Low | exc_Change | |
---|---|---|---|---|---|---|
0 | 2021-01-04 | 1086.48 | 1085.73 | 1088.30 | 1080.02 | 0.0016 |
1 | 2021-01-05 | 1086.42 | 1086.69 | 1090.33 | 1082.04 | -0.0001 |
2 | 2021-01-06 | 1087.93 | 1087.40 | 1089.79 | 1083.91 | 0.0014 |
3 | 2021-01-07 | 1094.28 | 1088.03 | 1096.78 | 1085.42 | 0.0058 |
4 | 2021-01-08 | 1092.93 | 1094.35 | 1099.21 | 1088.79 | -0.0012 |
... | ... | ... | ... | ... | ... | ... |
242 | 2021-12-08 | 1175.19 | 1176.85 | 1179.31 | 1175.12 | -0.0013 |
243 | 2021-12-09 | 1178.15 | 1173.42 | 1179.37 | 1172.62 | 0.0025 |
244 | 2021-12-10 | 1180.86 | 1178.27 | 1182.82 | 1176.40 | 0.0023 |
245 | 2021-12-13 | 1184.91 | 1180.96 | 1186.16 | 1177.03 | 0.0034 |
246 | 2021-12-14 | 1185.19 | 1185.10 | 1186.38 | 1180.97 | 0.0002 |
247 rows × 6 columns
data = pd.merge(data, EXC, how = 'outer')
data.columns
Index(['Date', 'weekday', 'weeknum', 'kospi_Close', 'kospi_Open', 'kospi_High', 'kospi_Low', 'kospi_Volume', 'kospi_Change', 'kosdaq_Close', 'kosdaq_Open', 'kosdaq_High', 'kosdaq_Low', 'kosdaq_Volume', 'kosdaq_Change', 'dji_Close', 'dji_Open', 'dji_High', 'dji_Low', 'dji_Volume', 'dji_Change', 'exc_Close', 'exc_Open', 'exc_High', 'exc_Low', 'exc_Change'], dtype='object')
data
Date | weekday | weeknum | kospi_Close | kospi_Open | kospi_High | kospi_Low | kospi_Volume | kospi_Change | kosdaq_Close | ... | dji_Open | dji_High | dji_Low | dji_Volume | dji_Change | exc_Close | exc_Open | exc_High | exc_Low | exc_Change | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2021-01-04 | 0 | 01 | 2944.45 | 2874.50 | 2946.54 | 2869.11 | 1.030000e+09 | 0.0247 | 977.62 | ... | 30627.47 | 30674.28 | 29881.82 | 476730000.0 | -0.0125 | 1086.48 | 1085.73 | 1088.30 | 1080.02 | 0.0016 |
1 | 2021-01-05 | 1 | 01 | 2990.57 | 2943.67 | 2990.57 | 2921.84 | 1.520000e+09 | 0.0157 | 985.76 | ... | 30204.25 | 30504.89 | 30141.78 | 350910000.0 | 0.0055 | 1086.42 | 1086.69 | 1090.33 | 1082.04 | -0.0001 |
2 | 2021-01-06 | 2 | 01 | 2968.21 | 2993.34 | 3027.16 | 2961.37 | 1.790000e+09 | -0.0075 | 981.39 | ... | 30362.78 | 31022.65 | 30313.07 | 500430000.0 | 0.0144 | 1087.93 | 1087.40 | 1089.79 | 1083.91 | 0.0014 |
3 | 2021-01-07 | 3 | 01 | 3031.68 | 2980.75 | 3055.28 | 2980.75 | 1.520000e+09 | 0.0214 | 988.86 | ... | 30901.18 | 31193.40 | 30897.86 | 430620000.0 | 0.0069 | 1094.28 | 1088.03 | 1096.78 | 1085.42 | 0.0058 |
4 | 2021-01-08 | 4 | 01 | 3152.18 | 3040.11 | 3161.11 | 3040.11 | 1.300000e+09 | 0.0397 | 987.79 | ... | 31069.58 | 31140.67 | 30793.27 | 385650000.0 | 0.0018 | 1092.93 | 1094.35 | 1099.21 | 1088.79 | -0.0012 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
242 | 2021-12-08 | 2 | 49 | 3001.80 | 3017.93 | 3036.13 | 2995.34 | 4.882500e+08 | 0.0034 | 1006.04 | ... | 35716.85 | 35839.72 | 35603.95 | 363110000.0 | 0.0009 | 1175.19 | 1176.85 | 1179.31 | 1175.12 | -0.0013 |
243 | 2021-12-09 | 3 | 49 | 3029.57 | 3007.00 | 3029.57 | 3001.55 | 5.134400e+08 | 0.0093 | 1022.87 | ... | 35722.26 | 35864.22 | 35579.91 | 335370000.0 | 0.0000 | 1178.15 | 1173.42 | 1179.37 | 1172.62 | 0.0025 |
244 | 2021-12-10 | 4 | 49 | 3010.23 | 3008.70 | 3017.64 | 2998.29 | 4.516000e+08 | -0.0064 | 1011.57 | ... | 35830.55 | 35982.69 | 35711.90 | 344050000.0 | 0.0061 | 1180.86 | 1178.27 | 1182.82 | 1176.40 | 0.0023 |
245 | 2021-12-13 | 0 | 50 | 3001.66 | 3019.67 | 3043.83 | 3000.51 | 3.758300e+08 | -0.0028 | 1005.96 | ... | 35906.91 | 35951.28 | 35610.42 | 419860000.0 | -0.0089 | 1184.91 | 1180.96 | 1186.16 | 1177.03 | 0.0034 |
246 | 2021-12-14 | 1 | 50 | 2987.95 | 2983.95 | 3001.70 | 2976.16 | 5.815700e+08 | -0.0046 | 1002.81 | ... | 35605.73 | 35779.20 | 35441.74 | 414320000.0 | -0.0030 | 1185.19 | 1185.10 | 1186.38 | 1180.97 | 0.0002 |
247 rows × 26 columns
# data[['kospi_Close', 'kospi_Open', 'kospi_High',
# 'kospi_Low', 'kospi_Volume', 'kospi_Change', 'kosdaq_Close',
# 'kosdaq_Open', 'kosdaq_High', 'kosdaq_Low', 'kosdaq_Volume',
# 'kosdaq_Change', 'dji_Close', 'dji_Open', 'dji_High', 'dji_Low',
# 'dji_Volume', 'dji_Change', 'exc_Close', 'exc_Open', 'exc_High',
# 'exc_Low', 'exc_Change']] = data[['kospi_Close', 'kospi_Open', 'kospi_High',
# 'kospi_Low', 'kospi_Volume', 'kospi_Change', 'kosdaq_Close',
# 'kosdaq_Open', 'kosdaq_High', 'kosdaq_Low', 'kosdaq_Volume',
# 'kosdaq_Change', 'dji_Close', 'dji_Open', 'dji_High', 'dji_Low',
# 'dji_Volume', 'dji_Change', 'exc_Close', 'exc_Open', 'exc_High',
# 'exc_Low', 'exc_Change']].shift(5)
data.isnull().sum()
Date 0 weekday 0 weeknum 0 kospi_Close 11 kospi_Open 11 kospi_High 11 kospi_Low 11 kospi_Volume 11 kospi_Change 11 kosdaq_Close 11 kosdaq_Open 11 kosdaq_High 11 kosdaq_Low 11 kosdaq_Volume 11 kosdaq_Change 11 dji_Close 7 dji_Open 7 dji_High 7 dji_Low 7 dji_Volume 7 dji_Change 7 exc_Close 0 exc_Open 0 exc_High 0 exc_Low 0 exc_Change 0 dtype: int64
- 미국 휴장일은 전일 데이터로 fill up
dji_nan = data[data['dji_Open'].isnull()].index
data.columns
Index(['Date', 'weekday', 'weeknum', 'kospi_Close', 'kospi_Open', 'kospi_High', 'kospi_Low', 'kospi_Volume', 'kospi_Change', 'kosdaq_Close', 'kosdaq_Open', 'kosdaq_High', 'kosdaq_Low', 'kosdaq_Volume', 'kosdaq_Change', 'dji_Close', 'dji_Open', 'dji_High', 'dji_Low', 'dji_Volume', 'dji_Change', 'exc_Close', 'exc_Open', 'exc_High', 'exc_Low', 'exc_Change'], dtype='object')
for i in dji_nan:
for j in range(6,12):
n = data.iloc[i-1,-j]
data.iloc[i,-j] = n
- 미국 지수의 경우 전일 시가는 24:00전에 알 수 있지만 다른정보는 24:00 이전에 알 수 없으므로 하루를 밀어줌.
data[['dji_Close', 'dji_High', 'dji_Low',
'dji_Volume', 'dji_Change']] = data[['dji_Close', 'dji_High', 'dji_Low',
'dji_Volume', 'dji_Change']].shift(1)
- 한국 휴장 일은 data 에서 삭제.
kospi_nan = data[data['kospi_Open'].isnull()].index
exc_nan = data[data['exc_Open'].isnull()].index
nan_list = set(kospi_nan.to_list() + exc_nan.to_list())
data.drop(nan_list, inplace=True)
data.drop(0, axis=0, inplace=True)
shift_col = data.columns.drop(['Date', 'weekday', 'weeknum'])
data[shift_col] = data[shift_col].shift(5)
data = data[5:].reset_index(drop=True)
data.isnull().sum()
Date 0 weekday 0 weeknum 0 kospi_Close 0 kospi_Open 0 kospi_High 0 kospi_Low 0 kospi_Volume 0 kospi_Change 0 kosdaq_Close 0 kosdaq_Open 0 kosdaq_High 0 kosdaq_Low 0 kosdaq_Volume 0 kosdaq_Change 0 dji_Close 0 dji_Open 0 dji_High 0 dji_Low 0 dji_Volume 0 dji_Change 0 exc_Close 0 exc_Open 0 exc_High 0 exc_Low 0 exc_Change 0 dtype: int64
ms_data = data.copy()
data['weekday'] = data['weekday'].astype('str')
dummies = pd.get_dummies(data[['weekday', 'weeknum']], prefix=['weekday','weeknum'])
m_data = pd.concat([data, dummies], axis=1)
m_data.drop(['weekday','weeknum'], axis=1, inplace=True)
m_data
Date | kospi_Close | kospi_Open | kospi_High | kospi_Low | kospi_Volume | kospi_Change | kosdaq_Close | kosdaq_Open | kosdaq_High | ... | weeknum_41 | weeknum_42 | weeknum_43 | weeknum_44 | weeknum_45 | weeknum_46 | weeknum_47 | weeknum_48 | weeknum_49 | weeknum_50 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2021-01-12 | 2990.57 | 2943.67 | 2990.57 | 2921.84 | 1.520000e+09 | 0.0157 | 985.76 | 976.43 | 985.76 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 2021-01-13 | 2968.21 | 2993.34 | 3027.16 | 2961.37 | 1.790000e+09 | -0.0075 | 981.39 | 987.25 | 990.88 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 2021-01-14 | 3031.68 | 2980.75 | 3055.28 | 2980.75 | 1.520000e+09 | 0.0214 | 988.86 | 983.28 | 993.91 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 2021-01-15 | 3152.18 | 3040.11 | 3161.11 | 3040.11 | 1.300000e+09 | 0.0397 | 987.79 | 990.70 | 995.22 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 2021-01-18 | 3148.45 | 3161.90 | 3266.23 | 3096.19 | 1.710000e+09 | -0.0012 | 976.63 | 988.38 | 993.20 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
225 | 2021-12-08 | 2899.72 | 2860.12 | 2905.74 | 2837.03 | 5.639300e+08 | 0.0214 | 977.15 | 969.90 | 977.73 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
226 | 2021-12-09 | 2945.27 | 2874.64 | 2945.27 | 2874.64 | 5.344600e+08 | 0.0157 | 977.43 | 967.34 | 978.16 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
227 | 2021-12-10 | 2968.33 | 2935.93 | 2975.44 | 2927.55 | 4.867500e+08 | 0.0078 | 998.47 | 981.65 | 998.49 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
228 | 2021-12-13 | 2973.25 | 2954.82 | 2983.50 | 2932.49 | 4.799100e+08 | 0.0017 | 991.87 | 990.07 | 994.15 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
229 | 2021-12-14 | 2991.72 | 2973.84 | 2992.31 | 2960.90 | 5.413700e+08 | 0.0062 | 996.64 | 996.89 | 999.52 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
230 rows × 78 columns
옵션/선물 만기일
# https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&qvt=0&query=%EC%98%B5%EC%85%98%EB%A7%8C%EA%B8%B0%EC%9D%BC
f_option = ['2021-01-14', '2021-02-10', '2021-03-11', '2021-04-08', '2021-05-13', '2021-06-10',
'2021-07-08', '2021-08-12', '2021-09-09', '2021-10-14', '2021-11-11', '2021-12-09']
f_futures = ['2021-03-11', '2021-06-10', '2021-09-09', '2021-12-09']
전체 모델링
sample_name = 'sample_submission.csv'
sample_submission = pd.read_csv(os.path.join(path, sample_name))
submission = sample_submission.set_index('Day')
from xgboost import XGBRegressor
xgb = XGBRegressor()
day_list = ['2021-11-29', '2021-11-30', '2021-12-01', '2021-12-02', '2021-12-03']
stock_list.drop(stock_list[stock_list['종목코드']=='031390'].index,inplace=True)
for code in tqdm(stock_list['종목코드'].values):
for day in day_list:
code_data = fdr.DataReader(code, start = start_date, end = end_date).reset_index()
code_data['y_Close'] = code_data['Close']
code_data[[
'Open', 'High', 'Low', 'Close', 'Volume', 'Change']] = code_data[[
'Open', 'High', 'Low', 'Close', 'Volume', 'Change']].shift(5)
#이동 평균선 추가 (5/10/20)
code_data['5_close'] = code_data['Close'].rolling(window=5).mean()
code_data['10_close'] = code_data['Close'].rolling(window=10).mean()
code_data['20_close'] = code_data['Close'].rolling(window=20).mean()
#6일전/7일전 종가 추가
code_data['6d_Close'] = code_data[['Close']].shift(1)
code_data['7d_Close'] = code_data[['6d_Close']].shift(1)
#6일전/7일전 종가와 전일 종가의 차이 비율 추가
code_data['diff_Close'] = (code_data['Close'] - code_data['6d_Close'])/code_data['Close']
code_data['diff_Close2'] = (code_data['Close'] - code_data['7d_Close'])/code_data['Close']
#6일전 거래량 추가
code_data['6d_Volume'] = code_data[['Volume']].shift(6)
#2일전 거래량과 전일 거래량의 차이 추가
code_data['diff_Volume'] = code_data['Volume'] - code_data['6d_Volume']
df = pd.merge(m_data, code_data, how='left', on='Date')
df = df.drop(df[df.isnull().any(axis=1)].index, axis=0)
market = stock_list[stock_list['종목코드'].str.contains(code)]['상장시장'].values[0]
if market == 'KOSPI':
df_kospi = df.drop(['kosdaq_Close', 'kosdaq_Open', 'kosdaq_High', 'kosdaq_Low', 'kosdaq_Volume', 'kosdaq_Change'], axis=1)
x_train = df_kospi[df_kospi['Date'] < day].drop(['Date','y_Close'], axis=1)
y_train = df_kospi[df_kospi['Date'] < day]['y_Close']
xgb.fit(x_train, y_train)
pred = xgb.predict(df_kospi[df_kospi['Date']==day].drop(['Date','y_Close'], axis=1))
else:
df_kosdaq = df.drop(['kospi_Close', 'kospi_Open', 'kospi_High', 'kospi_Low','kospi_Volume', 'kospi_Change', 'kosdaq_Close', 'kosdaq_Open'], axis=1)
x_train = df_kosdaq[df_kosdaq['Date'] < day].drop(['Date','y_Close'], axis=1)
y_train = df_kosdaq[df_kosdaq['Date'] < day]['y_Close']
xgb.fit(x_train, y_train)
pred = xgb.predict(df_kosdaq[df_kosdaq['Date']==day].drop(['Date','y_Close'], axis=1))
submission.loc[day,code] = pred
100%|████████████████████████████████████████████████████████████████████████████████| 369/369 [06:12<00:00, 1.01s/it]
# privite submission만 데이터셋으로 작성
submission = submission.iloc[5:,:]
submission
000060 | 000080 | 000100 | 000120 | 000150 | 000240 | 000250 | 000270 | 000660 | 000670 | ... | 330860 | 336260 | 336370 | 347860 | 348150 | 348210 | 352820 | 357780 | 363280 | 950130 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Day | |||||||||||||||||||||
2021-11-29 | 29417.746094 | 33234.343750 | 60132.703125 | 136056.812500 | 119314.929688 | 16740.277344 | 45884.242188 | 82530.968750 | 107088.679688 | 683745.0625 | ... | 46994.058594 | 49025.132812 | 89673.695312 | 37659.074219 | 19519.048828 | 50946.542969 | 372958.59375 | 265210.156250 | 25597.248047 | 16075.208008 |
2021-11-30 | 30002.962891 | 30610.638672 | 59678.476562 | 129827.921875 | 117337.351562 | 15388.730469 | 44593.121094 | 79988.164062 | 114570.687500 | 653516.1875 | ... | 43664.417969 | 50585.960938 | 97681.562500 | 35288.453125 | 20594.652344 | 50690.828125 | 379092.87500 | 264948.375000 | 23948.753906 | 16622.246094 |
2021-12-01 | 29913.453125 | 29738.423828 | 58936.699219 | 125720.078125 | 116199.320312 | 15408.984375 | 45524.351562 | 79979.945312 | 115808.617188 | 643901.2500 | ... | 44753.972656 | 49181.937500 | 97690.156250 | 36902.507812 | 19222.310547 | 46054.750000 | 367844.65625 | 259253.796875 | 23623.125000 | 17566.416016 |
2021-12-02 | 30087.794922 | 30333.685547 | 57824.656250 | 126370.695312 | 114069.851562 | 15501.014648 | 44904.937500 | 80623.687500 | 116014.000000 | 648058.5000 | ... | 46721.980469 | 49454.167969 | 100551.687500 | 36566.273438 | 20361.396484 | 48312.214844 | 375545.75000 | 272749.937500 | 24086.357422 | 16994.531250 |
2021-12-03 | 31241.271484 | 29993.458984 | 60009.445312 | 128049.921875 | 116856.796875 | 16200.601562 | 44051.550781 | 80780.773438 | 115700.054688 | 662439.3750 | ... | 48972.601562 | 46961.933594 | 98532.125000 | 37583.468750 | 21500.310547 | 49715.628906 | 343600.87500 | 269958.593750 | 25039.224609 | 17610.996094 |
5 rows × 370 columns
실제 종가와 비교
true_close = sample_submission.copy().set_index('Day')
code_data = fdr.DataReader('000060', start = '2021-11-01', end = '2021-11-01')['Close'].reset_index()
for code in tqdm(stock_list['종목코드'].values):
for day in day_list:
code_data = fdr.DataReader(code, start = day, end = day)['Close'].reset_index()
true_close.loc[day,code] = code_data['Close'].values[0]
100%|████████████████████████████████████████████████████████████████████████████████| 369/369 [02:19<00:00, 2.65it/s]
true_close = true_close.iloc[5:,:]
true_close
000060 | 000080 | 000100 | 000120 | 000150 | 000240 | 000250 | 000270 | 000660 | 000670 | ... | 330860 | 336260 | 336370 | 347860 | 348150 | 348210 | 352820 | 357780 | 363280 | 950130 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Day | |||||||||||||||||||||
2021-11-29 | 31200.0 | 30300.0 | 58900.0 | 129000.0 | 110500.0 | 15200.0 | 43250.0 | 79200.0 | 116000.0 | 641000.0 | ... | 43750.0 | 48750.0 | 100500.0 | 37100.0 | 19800.0 | 49100.0 | 369000.0 | 266300.0 | 24200.0 | 17650.0 |
2021-11-30 | 31300.0 | 29000.0 | 57800.0 | 124000.0 | 108500.0 | 15150.0 | 42150.0 | 77800.0 | 114000.0 | 632000.0 | ... | 43300.0 | 49250.0 | 97800.0 | 35550.0 | 18600.0 | 45500.0 | 364500.0 | 255800.0 | 23100.0 | 19500.0 |
2021-12-01 | 31700.0 | 29400.0 | 57700.0 | 125000.0 | 112000.0 | 15500.0 | 42750.0 | 81200.0 | 116500.0 | 630000.0 | ... | 48950.0 | 48700.0 | 98500.0 | 36050.0 | 18900.0 | 46200.0 | 352500.0 | 264200.0 | 23850.0 | 18650.0 |
2021-12-02 | 32150.0 | 29550.0 | 60100.0 | 129000.0 | 110000.0 | 16150.0 | 43100.0 | 81600.0 | 120000.0 | 639000.0 | ... | 51900.0 | 46250.0 | 95100.0 | 33800.0 | 18650.0 | 48550.0 | 330000.0 | 274700.0 | 25200.0 | 18050.0 |
2021-12-03 | 32700.0 | 30600.0 | 60300.0 | 131000.0 | 108500.0 | 16400.0 | 44900.0 | 82500.0 | 118000.0 | 642000.0 | ... | 51900.0 | 46800.0 | 95300.0 | 34500.0 | 19100.0 | 49000.0 | 354500.0 | 275900.0 | 25800.0 | 18150.0 |
5 rows × 370 columns
diff = np.mean(np.abs(submission-true_close)/true_close)
np.mean(np.mean(np.abs(submission[:5] - true_close[:5])/true_close[:5]))*100
4.453063727451483
댓글남기기