大数据分析——对世界杯比赛进行分析及预测

分类: 365bet开户官网 发布时间: 2025-07-08 01:36:04
作者: admin 阅读: 8351 | 点赞: 715
大数据分析——对世界杯比赛进行分析及预测

2.导入第三方库

3.加载三个数据文件,并删除缺失值

4.数据分析

简单的分析一下世界杯的冠军情况

5.数据预处理

接着创建一个字典用来储存球队名称

删除不必要的列同时计算每支队伍成为冠军的次数

定义一个函数用于找出获胜队伍

将team-name字典中的团队名称替换为id

5.准备建模用到的X,y数据

打乱数据

6.模型构建

用svm支持向量机模型进行训练并打印模型的评估指标

决策树模型更加准确 ,最终选择决策树算法作为模型选择。

7.模型预测

这里使用2022年卡塔尔世界杯最后半决赛部分来检测模型效果

预测正确

预测正确

预测正确

可惜,预测失败

(四)实验源代码

# World Cup match analysis and outcome prediction.
#
# Loads the historical World Cup CSV files, derives per-match features
# (team ids and number of past titles for each side), trains an SVM and a
# decision tree on them, and prints win/draw probabilities for a few
# 2022 fixtures.
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle

warnings.filterwarnings('ignore')

# Load the three data files and drop rows with missing values.
matches = pd.read_csv('WorldCupMatches.csv').dropna()
players = pd.read_csv('WorldCupPlayers.csv').dropna()
cups = pd.read_csv('WorldCupsSummary.csv').dropna()

# Number of World Cup titles per winner.
plt.figure(figsize=(12, 6))
sns.countplot(x='Winner', data=cups)
plt.show()


def replace_name(df):
    """Return the match row with historical team names normalised.

    'German DR' and 'Germany FR' become 'Germany'; 'Soviet Union' becomes
    'Russia'. Applied to both the home and the away team column.
    """
    for column in ('Home Team Name', 'Away Team Name'):
        if df[column] in ['German DR', 'Germany FR']:
            df[column] = 'Germany'
        elif df[column] == 'Soviet Union':
            df[column] = 'Russia'
    return df


matches = matches.apply(replace_name, axis='columns')

# Map every team name to an integer id, in order of first appearance
# (home team before away team within each match).
team_name = {}
for home, away in zip(matches['Home Team Name'], matches['Away Team Name']):
    for name in (home, away):
        if name not in team_name:
            team_name[name] = len(team_name)

# Drop columns that are not used as features. `columns=` is used because
# pandas 2.x no longer accepts the axis as a positional argument.
dropped_matches = matches.drop(
    columns=[
        'Datetime', 'Stadium', 'Referee', 'Assistant 1', 'Assistant 2',
        'RoundID', 'Home Team Initials', 'Away Team Initials',
        'Half-time Home Goals', 'Half-time Away Goals', 'Attendance',
        'City', 'MatchID', 'Stage',
    ]
)

# Number of titles per team, counting 'Germany FR' as 'Germany'.
championships = cups['Winner'].replace({'Germany FR': 'Germany'}).value_counts()

# Title counts for the home and the away side; 0 for teams without a title.
dropped_matches['Home Team Championship'] = 0
dropped_matches['Away Team Championship'] = 0


def count_championship(df):
    """Fill in the number of titles won by the home and away team of a row."""
    df['Home Team Championship'] = championships.get(df['Home Team Name'], 0)
    df['Away Team Championship'] = championships.get(df['Away Team Name'], 0)
    return df


dropped_matches = dropped_matches.apply(count_championship, axis='columns')

# Match outcome label: 0 = draw, 1 = home win, 2 = away win.
dropped_matches['Winner'] = '-'


def find_winner(df):
    """Set 'Winner' on a match row from the final goal counts."""
    home_goals = int(df['Home Team Goals'])
    away_goals = int(df['Away Team Goals'])
    if home_goals == away_goals:
        df['Winner'] = 0
    elif home_goals > away_goals:
        df['Winner'] = 1
    else:
        df['Winner'] = 2
    return df


dropped_matches = dropped_matches.apply(find_winner, axis='columns')


def replace_team_name_by_id(df):
    """Replace both team names on a match row with their ids from team_name."""
    df['Home Team Name'] = team_name[df['Home Team Name']]
    df['Away Team Name'] = team_name[df['Away Team Name']]
    return df


teamid_matches = dropped_matches.apply(replace_team_name_by_id, axis='columns')

# Year and goal counts are not model inputs.
teamid_matches = teamid_matches.drop(
    columns=['Year', 'Home Team Goals', 'Away Team Goals']
)

X = teamid_matches[
    [
        'Home Team Name', 'Away Team Name',
        'Home Team Championship', 'Away Team Championship',
    ]
].to_numpy().astype('float64')

# The label column is already one-dimensional, so no reshape is needed; the
# previous reshape to (1, 850) failed for any dataset without exactly 850 rows.
y = dropped_matches['Winner'].to_numpy().astype('int')

# Augment the data with the mirrored fixture: swap home/away ids and title
# counts, and swap the home-win / away-win labels (draws stay draws).
_X = X[:, [1, 0, 3, 2]]
_y = np.array([0, 2, 1])[y]

X = np.concatenate((X, _X), axis=0)
y = np.concatenate((y, _y))
print(X)
print(y)

# Shuffle, then split into training and test sets.
# NOTE(review): a match and its mirrored copy can land on opposite sides of
# the split, so the test score is still optimistic — consider splitting
# before augmenting.
X, y = shuffle(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train the SVM on the training split only, so that the test-set metrics
# below are measured on rows the model has not seen.
svm_model = SVC(kernel='rbf', class_weight='balanced', probability=True)
svm_model.fit(X_train, y_train)
print("Predicting on the test set")
y_pred = svm_model.predict(X_test)
print(svm_model.score(X_test, y_test))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred, labels=range(3)))

# Decision tree, trained and evaluated on the same split.
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
print("Predicting on the test set")
y_pred = tree_model.predict(X_test)
print(tree_model.score(X_test, y_test))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred, labels=range(3)))


def prediction(team1, team2):
    """Predict the outcome of team1 (home) versus team2 (away).

    Args:
        team1: Name of the first team; must be a key of team_name.
        team2: Name of the second team; must be a key of team_name.

    Returns:
        A tuple (draw_probability, text) where text is a three-line summary
        of the win/win/draw chances in percent.

    Raises:
        KeyError: If either team is not present in team_name.
    """
    # NOTE(review): the accompanying write-up says the decision tree was
    # selected, but predictions here come from svm_model — confirm intent.
    id1 = team_name[team1]
    id2 = team_name[team2]
    championship1 = championships.get(team1, 0)
    championship2 = championships.get(team2, 0)

    x = np.array([id1, id2, championship1, championship2]).astype('float64')
    x = np.reshape(x, (1, -1))
    # Assumes all three labels occur in the training data, so the columns
    # are ordered draw (0), team1 win (1), team2 win (2).
    _y = svm_model.predict_proba(x)[0]

    text = (
        'Chance for ' + team1 + ' to win ' + team2 + ' is {}\n'
        'Chance for ' + team2 + ' to win ' + team1 + ' is {}\n'
        'Chance for ' + team1 + ' and ' + team2 + ' draw is {}'
    ).format(_y[1] * 100, _y[2] * 100, _y[0] * 100)
    return _y[0], text


# England vs France
prob, text = prediction('England', 'France')
print(text)

# Argentina vs Croatia
prob, text = prediction('Argentina', 'Croatia')
print(text)

# France vs Morocco
prob, text = prediction('France', 'Morocco')
print(text)

# Croatia vs Morocco
prob, text = prediction('Croatia', 'Morocco')
print(text)

# Argentina vs France
prob, text = prediction('Argentina', 'France')
print(text)

(五)总结:通过这次python实战,我学到了许多新知识,丰富了经验,缩小了实践和理论的差距。在今后的生活中,我将继续学习不断提升理论涵养,深入实践,提供自身综合素质。