用于预测市场走势和回测的 ML 分类算法
作者:Sabir Jana
文章:ML Classification Algorithms to Predict Market Movements and Backtesting
将使用基于多种机器学习分类算法的股票交易策略来预测市场走势。总体方法如下:
- 收集历史数据
- 建立特征工程
- 构建机器学习算法
- 使用Backtrader进行回测
- 回测绩效分析
准备数据
在此使用ETF 516510.SH 作为分析标的,计算每日对数收益率,绘制趋势图。通过可视化收盘价和每日收益率,以检查数据。
# Put the bars in chronological order before any shift(): the preview printed
# further down lists 2024-07-04 ahead of 2024-07-03, i.e. the frame arrives
# newest-first, in which case shift(1) would read the *next* trading day's
# close and every "lagged" feature would leak future information.
# Sorting is a no-op when the index is already ascending.
data_prices.sort_index(inplace=True)

# Daily log return: ln(close_t / close_{t-1}); the first row has no predecessor.
data_prices['returns'] = np.log(data_prices['close'] / data_prices['close'].shift(1))
data_prices.dropna(inplace=True)

# Classification target: sign of the daily return (-1, 0 or +1; 0 when the
# close is unchanged).
data_prices['direction'] = np.sign(data_prices['returns']).astype(int)

# Close price (top) and daily returns (bottom) on a shared date axis.
fig, ax = plt.subplots(2, 1, sharex=True, figsize=(12, 6))
ax[0].plot(data_prices['close'], label=f'{ts_code} - close')
ax[0].set(title=f'{ts_code} Close Price', ylabel='Price')
ax[0].grid(True)
ax[0].legend()
ax[1].plot(data_prices['returns'], label='Daily Returns')
ax[1].set(title=f'{ts_code} Returns', ylabel='rtn')
ax[1].grid(True)
# Address the lower axes explicitly instead of relying on pyplot's current axes.
ax[1].legend()
plt.tight_layout()
# plt.savefig('images/chart1', dpi=300)
特征工程
选择滞后收益率作为特征。先计算标的的滞后对数收益率,再以 0 为分界将其转换为二值数据(负收益记为 0,非负收益记为 1)。
# Look-back horizons (in rows) used as features.
lags = [1, 2, 3, 4, 5]

# One column name per horizon: rtn_lag1 ... rtn_lag5.
cols = [f'rtn_lag{lag}' for lag in lags]

# Fill each column with the return observed `lag` rows earlier.
# NOTE(review): this is only a look-*back* if data_prices is ordered by
# ascending date - verify against the loader.
for lag, col in zip(lags, cols):
    data_prices[col] = data_prices['returns'].shift(lag)

# The leading rows have incomplete lag history; discard them.
data_prices.dropna(inplace=True)
def create_bins(data, bins=None, columns=None):
    """Discretise lag-return columns into integer bin indices.

    For every source column ``c`` a new column ``c + '_bin'`` is written into
    ``data`` holding ``np.digitize(data[c], bins)``. With the default single
    edge at 0 this yields 0 for negative values and 1 for values >= 0.

    Args:
        data: DataFrame that contains the source columns; modified in place.
        bins: Monotonic sequence of bin edges. Defaults to ``[0]``.
        columns: Names of the columns to discretise. Defaults to the
            module-level list ``cols``.

    Returns:
        The list of created column names. The same list is also stored in the
        module-level ``cols_bin`` for existing callers that read it.
    """
    global cols_bin
    # None sentinels avoid sharing one mutable default list between calls.
    if bins is None:
        bins = [0]
    if columns is None:
        columns = cols

    cols_bin = []
    for col in columns:
        col_bin = col + '_bin'
        data[col_bin] = np.digitize(data[col], bins=bins)
        cols_bin.append(col_bin)
    return cols_bin
# Add the binary feature columns to data_prices; create_bins also records
# their names in the module-level list `cols_bin`.
create_bins(data_prices)
# Notebook-style preview: first two rows of the raw and binarised lag features.
data_prices[cols + cols_bin].head(2)
rtn_lag1 | rtn_lag2 | rtn_lag3 | rtn_lag4 | rtn_lag5 | rtn_lag1_bin | rtn_lag2_bin | rtn_lag3_bin | rtn_lag4_bin | rtn_lag5_bin | |
---|---|---|---|---|---|---|---|---|---|---|
datetime | ||||||||||
2024-07-04 | 0.030382 | -0.027885 | 0.002503 | -0.012453 | 0.014963 | 1 | 0 | 1 | 0 | 1 |
2024-07-03 | -0.007509 | 0.030382 | -0.027885 | 0.002503 | -0.012453 | 0 | 1 | 0 | 1 | 0 |
构建机器学习算法
将使用 Logistic 回归、Gaussian Naive Bayes、支持向量机 (SVM)、随机森林和 MLP 分类器方法来预测市场方向 (+1, -1)。然后,我们将使用矢量化回测来评估每个模型的性能,并可视化累积回报。
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
# Candidate classifiers, keyed by the short name that is later embedded in the
# derived column names (pos_<name>, strategy_<name>).
models = dict(
    log_reg=linear_model.LogisticRegression(),
    gauss_nb=GaussianNB(),
    svm=SVC(),
    random_forest=RandomForestClassifier(max_depth=10, n_estimators=100),
    MLP=MLPClassifier(max_iter=500),
)
def fit_models(data):
    """Fit every estimator in ``models`` on the binary lag features.

    The feature matrix is ``data[cols_bin]`` and the target is
    ``data['direction']``. The estimators stored in ``models`` are fitted in
    place, so later calls that read ``models`` see the trained objects.

    Args:
        data: DataFrame holding the ``cols_bin`` feature columns and a
            ``direction`` column.

    Returns:
        Dict mapping each model name to the object returned by its ``fit``
        call (previously this mapping was built and then thrown away).
    """
    features = data[cols_bin]
    target = data['direction']
    return {name: estimator.fit(features, target) for name, estimator in models.items()}
def derive_positions(data):
    """Write each model's prediction into a ``pos_<model name>`` column.

    Predictions are made from ``data[cols_bin]`` with the estimators held in
    the module-level ``models`` dict; ``data`` is modified in place.
    """
    features = data[cols_bin]
    for name, estimator in models.items():
        data[f'pos_{name}'] = estimator.predict(features)
def evaluate_strats(data):
    """Compute per-model strategy returns and publish their column names.

    For each model a ``strategy_<name>`` column is added to ``data`` as the
    product of ``pos_<name>`` and ``returns``. The module-level list
    ``strategy_rtn`` is rebuilt as ``['returns', 'strategy_<name>', ...]`` so
    the baseline column comes first.
    """
    global strategy_rtn
    strategy_rtn = ['returns']
    for name in models:
        target_col = f'strategy_{name}'
        data[target_col] = data[f'pos_{name}'] * data['returns']
        strategy_rtn.append(target_col)
# Train every model, then predict on the very same frame (in-sample).
fit_models(data_prices)
derive_positions(data_prices)
# Strategy return per row = predicted direction x realised daily return.
evaluate_strats(data_prices)

# A row whose position differs from the previous row is counted as a trade.
# NOTE(review): diff() is NaN on the first row and NaN != 0 is True, so the
# first row is always counted - confirm that is intended.
for label, column in (
    ('Number of trades SVM=', 'pos_svm'),
    ('Number of trades Ramdom Forest=', 'pos_random_forest'),
):
    print(label, (data_prices[column].diff() != 0).sum())

# Summed log returns, exponentiated back to a growth factor, one line per column.
cumulative = data_prices[strategy_rtn].cumsum().apply(np.exp)
ax = cumulative.plot(figsize=(12, 6), title='Machine Learning Classifiers Return Comparison')
ax.set_ylabel('Cumulative Returns')
ax.grid(True)
plt.tight_layout()
Number of trades SVM= 382
Number of trades Ramdom Forest= 408
Backtrader策略回测
采用支持向量机 (SVM),并使用 python 库 Backtrader 执行回溯测试。
- 初始资金为 100,000;原文使用 0.1% 的交易佣金,本文代码中实际设置为 0.015%(commission = 0.00015)
- 当预测值为 +1 时,我们买入,当预测值为 -1 时卖出(仅当拥有股票时)
- 全押策略 — 创建买单时,尽可能多地购买股票
- 不允许卖空。
标的采用 A 股市场 ETF,因此与原文会有差异。
class My_PandasData(bt.feeds.PandasData):
    """Pandas data feed that carries the SVM prediction as an extra line.

    Besides the standard price/volume lines, strategies can read the
    ``pos_svm`` line declared here.
    """

    # Additional line on top of those inherited from PandasData.
    lines = ('pos_svm',)

    # Column mapping and date window. Non-negative integers are positional
    # column indices in the DataFrame. (backtrader PandasData convention:
    # -1 = auto-detect by column name, None for `datetime` = use the index.)
    params = dict(
        fromdate=dt.datetime(2021, 4, 7),
        todate=dt.datetime(2024, 7, 14),
        nullvalue=0.0,
        datetime=None,
        time=-1,
        high=2,
        low=3,
        open=0,
        close=1,
        volume=4,
        openinterest=-1,
        pos_svm=-1,
    )
class MLStrategy(bt.Strategy):
    """Long-only, all-in strategy driven by the pre-computed ``pos_svm`` line.

    Trading logic runs in ``next_open`` (the engine must be created with
    ``cheat_on_open=True`` for that hook to be called):

    * flat and prediction > 0  -> buy with 95% of cash, rounded down to a
      multiple of 100 units;
    * long and prediction < 0  -> sell the whole position.

    Only one order may be outstanding at a time; ``self.order`` tracks it.
    """

    params = dict(
    )

    def __init__(self):
        # Keep direct references to the lines used on every bar.
        self.data_predicted = self.datas[0].pos_svm
        self.data_open = self.datas[0].open
        self.data_close = self.datas[0].close
        # Order currently awaiting execution, or None.
        self.order = None

    def log(self, txt):
        """Print ``txt`` prefixed with the current bar's ISO date."""
        # Named bar_date so the module-level `dt` alias is not shadowed.
        bar_date = self.datas[0].datetime.date(0).isoformat()
        print(f'{bar_date}, {txt}')

    def notify_order(self, order):
        """Report order outcomes and release the pending-order slot."""
        if order.status in [order.Submitted, order.Accepted]:
            # Still in flight - keep self.order set so operate() stays idle.
            return

        if order.status in [order.Completed]:
            if order.isbuy():
                self.log(f'BUY EXECUTED --- Price: {order.executed.price:.2f}, Cost: {order.executed.value:.2f},Commission: {order.executed.comm:.2f}'
                         )
                # Remember the entry fill for later inspection.
                self.price = order.executed.price
                self.comm = order.executed.comm
            else:
                self.log(f'SELL EXECUTED --- Price: {order.executed.price:.2f}, Cost: {order.executed.value:.2f},Commission: {order.executed.comm:.2f}'
                         )
        elif order.status in [order.Canceled, order.Margin,
                              order.Rejected]:
            self.log('Order Failed')

        # Any status other than Submitted/Accepted ends the wait.
        self.order = None

    def notify_trade(self, trade):
        """Log gross and net P&L once a round-trip trade has closed."""
        if not trade.isclosed:
            return
        self.log(f'OPERATION RESULT --- Gross: {trade.pnl:.2f}, Net: {trade.pnlcomm:.2f}')

    def next(self):
        # All decisions are taken in next_open(); nothing to do at bar close.
        pass

    def next_open(self):
        self.operate()

    def operate(self):
        """Place at most one order for the current bar based on the signal."""
        # Do not stack orders while one is still pending.
        if self.order is not None:
            return

        signal = self.data_predicted[0]
        if not self.position:
            if signal > 0:
                # 95% of cash at the open price, rounded down to whole lots of 100.
                size = int(self.broker.getcash() * 0.95 / self.data_open[0]) // 100 * 100
                if size <= 0:
                    # Not enough cash for a single lot: skip instead of
                    # logging a buy that would never be submitted.
                    return
                # Store the order so the pending-order guard above works.
                self.order = self.buy(size=size)
                self.log(f'买入 --- 份额:{size}, 资金:{self.broker.getcash():.2f}, 开盘价格:{self.data_open[0]:.2f}, 收盘价格:{self.data_close[0]:.2f}')
        else:
            if signal < 0:
                self.order = self.sell(size=self.position.size)
                self.log(f'卖出 - 份额:{self.position.size}')
# Engine without the default observers, with cheat-on-open enabled.
cerebro = bt.Cerebro(stdstats=False, cheat_on_open=True)

# Data feed
datafeed = My_PandasData(dataname=bt_data_prices)
cerebro.adddata(datafeed, name=ts_code)

# Strategy, plus the PyFolio analyzer whose output is read after the run
cerebro.addstrategy(MLStrategy)
cerebro.addanalyzer(bt.analyzers.PyFolio, _name='pyfolio')

# Broker: starting cash
start_value = 100000.0
cerebro.broker.setcash(start_value)

# Broker: commission scheme (only the commission rate is non-zero)
comminfo = ChStockCommission(
    miss5=True,
    commission=0.00015,
    stamp_duty=0,
    transfer_fee=0,
    transaction_fee=0,
)
cerebro.broker.addcommissioninfo(comminfo)

# Run the backtest, printing portfolio value before and after
print('期初资产: %.2f' % cerebro.broker.getvalue())
result = cerebro.run()
print('期末资产: %.2f' % cerebro.broker.getvalue())
回测分析
import pyfolio as pf
# Only one strategy was added to cerebro, so take the first entry of the run result.
strat = result[0]
# Fetch the analyzer by the name it was registered under ('pyfolio').
pyfoliozer = strat.analyzers.getbyname('pyfolio')
# Unpack the four items the analyzer exposes for pyfolio.
returns, positions, transactions, gross_lev = pyfoliozer.get_pf_items()
# get performance statistics for strategy
pf.show_perf_stats(returns)
Start date | 2021-04-07 | |
---|---|---|
End date | 2024-07-04 | |
Total months | 37 | |
Backtest | ||
Annual return | 3.769% | |
Cumulative returns | 12.25% | |
Annual volatility | 23.962% | |
Sharpe ratio | 0.27 | |
Calmar ratio | 0.10 | |
Stability | 0.03 | |
Max drawdown | -36.571% | |
Omega ratio | 1.06 | |
Sortino ratio | 0.42 | |
Skew | 0.69 | |
Kurtosis | 4.37 | |
Tail ratio | 1.26 | |
Daily value at risk | -2.993% |
# --- Benchmark series (399300.SZ) -------------------------------------------
benchmark_data = pd.read_csv('./datas/399300.SZ.csv', parse_dates=['trade_date'])
# Sort ascending so pct_change() looks backwards and the label slice below is
# well defined; harmless if the file is already in chronological order.
benchmark_data = benchmark_data.set_index('trade_date').sort_index()
# Make the index tz-aware (UTC) before slicing it with the strategy's dates.
benchmark_data.index = benchmark_data.index.tz_localize('UTC')
# Simple (arithmetic) returns: pyfolio compounds its inputs as (1 + r), so log
# returns here would not be comparable with the strategy's return series.
benchmark_data['returns'] = benchmark_data['close'].pct_change()
# Restrict to the backtest window and name the series after the index code.
benchmark_rets = (
    benchmark_data['returns']
    .loc[returns.index[0]:returns.index[-1]]
    .rename('399300.SZ')
)

# --- Figure 1: drawdowns, cumulative returns vs benchmark, rolling Sharpe ---
# Only tight_layout() is used for spacing; combining it with
# constrained_layout=True makes Matplotlib swap layout engines with a warning.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 9))
axes = ax.flatten()
pf.plot_drawdown_periods(returns=returns, ax=axes[0])
axes[0].grid(True)
pf.plot_rolling_returns(returns=returns,
                        factor_returns=benchmark_rets,
                        ax=axes[1], title='Strategy vs 399300.SZ')
axes[1].grid(True)
pf.plot_drawdown_underwater(returns=returns, ax=axes[2])
axes[2].grid(True)
pf.plot_rolling_sharpe(returns=returns, ax=axes[3])
axes[3].grid(True)
# Every panel is styled through its own Axes above, so no pyplot-level
# grid()/legend() call on the implicit "current" axes is needed.
plt.tight_layout()
# plt.savefig('images/chart3', dpi=300)

# --- Figure 2: rolling beta/volatility, annual and monthly returns ----------
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 9))
axes = ax.flatten()
pf.plot_rolling_beta(returns=returns, factor_returns=benchmark_rets, ax=axes[0])
axes[0].grid(True)
pf.plot_rolling_volatility(returns=returns, factor_returns=benchmark_rets, ax=axes[1])
axes[1].grid(True)
pf.plot_annual_returns(returns=returns, ax=axes[2])
axes[2].grid(True)
pf.plot_monthly_returns_heatmap(returns=returns, ax=axes[3])
plt.tight_layout()
# plt.savefig('images/chart4', dpi=300)
总之,矢量化回测的结果通常在纸面上看起来不错,但在决定实施此类策略之前,我们还需要考虑执行落差(implementation shortfall)和可行性等各个方面。另外,请记住,资本市场不仅仅是机器学习,否则所有数据科学家早就变得超级富有了。
原文地址:https://blog.csdn.net/vipsh2011/article/details/143063923
免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!