import numpy as np
import httpx
from selectolax.parser import HTMLParser
from dataclasses import dataclass, asdict
import re
import csv
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
%matplotlib inline
# Global plot styling for every figure drawn below.
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")
# NOTE(review): set_theme() is applied last and presumably re-applies seaborn's
# default theme, overriding the two style calls above -- confirm the intended order.
sns.set_theme()
@dataclass
class Player:
    """One scraped row of the player market-value table.

    The field order matches the column order written to ``results.csv``.
    """
    # NOTE: dataclass annotations are not enforced at runtime; parse_players
    # fills every field with text extracted from the HTML, so the numeric
    # fields hold strings until the CSV is re-read with pandas.
    name: str
    position: str
    age : int
    club : str
    matches : int
    goals : int
    assists : int
    subOn : int  # times substituted on
    subOff: int  # times substituted off
    value : float  # first decimal number found in the market-value cell
def parse_players(html):
    """Extract one record per player row from a parsed listing page.

    Args:
        html: Parsed page exposing ``css()``; rows are selected with the
            ``tr.odd`` and ``tr.even`` selectors.

    Returns:
        A list of dicts (``asdict(Player)``), one per row. All values are the
        raw extracted text; no numeric conversion is done here.

    Raises:
        IndexError: If a row has fewer cells/links than expected, or the
            market-value cell contains no decimal number.
        AttributeError: If an expected cell is missing from a row.
    """
    # Raw string avoids invalid-escape warnings; compiled once for all rows.
    value_pattern = re.compile(r"\d+\.\d+")
    results = []
    # A tuple (not a set) keeps the row order deterministic between runs.
    for row_class in ("odd", "even"):
        for player in html.css("tr." + row_class):
            # The centred stat cells are queried once and indexed by position.
            cells = player.css("td.zentriert")
            raw_value = player.css_first("td.rechts").text()
            new_data = Player(
                name=player.css_first("td.hauptlink").text(),
                position=player.css("tr")[2].text(),
                age=cells[1].text(),
                club=player.css("a")[2].attributes.get('title', ''),
                matches=cells[4].text(),
                goals=cells[5].text(),
                assists=cells[7].text(),
                subOn=cells[11].text(),
                subOff=cells[12].text(),
                # Keep only the first decimal number of the value cell.
                value=value_pattern.findall(raw_value)[0],
            )
            results.append(asdict(new_data))
    return results
def to_csv(results):
    """Append player records to ``results.csv`` (no header row is written).

    Args:
        results: Iterable of dicts keyed by the ``Player`` field names.

    The file is opened in append mode, so repeated calls (one per scraped
    page) accumulate rows in the same file.
    """
    fieldnames = ["name", "position", "age", "club", "matches",
                  "goals", "assists", "subOn", "subOff", "value"]
    # newline='' lets the csv module control line endings; without it every
    # row is followed by a blank line on Windows.
    with open("results.csv", "a", encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writerows(results)
def get_html(page):
    """Download one page of the most-valuable-players listing and parse it.

    Args:
        page: Page number substituted into the ``page`` query parameter.

    Returns:
        An ``HTMLParser`` built from the response body.

    Raises:
        httpx.HTTPStatusError: If the server does not answer with a success
            status, instead of silently parsing an error page into zero rows.
    """
    url = (
        "https://www.transfermarkt.com/spieler-statistik/wertvollstespieler/marktwertetop"
        "?land_id=0&ausrichtung=alle&spielerposition_id=alle&altersklasse=alle"
        "&jahrgang=0&kontinent_id=0&plus=1"
        f"&page={page}"
    )
    resp = httpx.get(url)
    resp.raise_for_status()
    return HTMLParser(resp.text)
@dataclass
class Team:
    """One scraped row of the club ranking table: a club and its country.

    The field order matches the column order written to ``clubs.csv``.
    """
    club: str  # 'title' attribute of the row's first link
    country: str  # 'title' attribute of the row's second image
def parse_teams(html):
    """Extract one club/country record per table row of a parsed ranking page.

    Args:
        html: Parsed page exposing ``css()``; rows are selected with the
            ``tr.odd`` and ``tr.even`` selectors.

    Returns:
        A list of dicts (``asdict(Team)``), one per row.

    Raises:
        IndexError: If a row has no link or fewer than two images.
    """
    results = []
    # A tuple (not a set) keeps the row order deterministic between runs.
    for row_class in ("odd", "even"):
        for team in html.css("tr." + row_class):
            new_data = Team(
                club=team.css("a")[0].attributes.get('title', ''),
                country=team.css("img")[1].attributes.get('title', ''),
            )
            results.append(asdict(new_data))
    return results
def to_clubcsv(results):
    """Append club records to ``clubs.csv`` (no header row is written).

    Args:
        results: Iterable of dicts with the keys ``club`` and ``country``.

    The file is opened in append mode, so repeated calls accumulate rows.
    """
    # newline='' lets the csv module control line endings; without it every
    # row is followed by a blank line on Windows.
    with open("clubs.csv", "a", encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=["club", "country"])
        writer.writerows(results)
def get_club_html(page):
    """Download one page of the UEFA club ranking and parse it.

    Args:
        page: Page number substituted into the URL path.

    Returns:
        An ``HTMLParser`` built from the response body.

    Raises:
        httpx.HTTPStatusError: If the server does not answer with a success
            status, instead of silently parsing an error page into zero rows.
    """
    url = f"https://www.transfermarkt.us/uefa/klubrangliste/statistik/stat/page/{page}/"
    resp = httpx.get(url)
    resp.raise_for_status()
    return HTMLParser(resp.text)
# Loop over the listing pages to collect the data.
# Player ranking: pages 1-9 are fetched, parsed and appended to results.csv.
# NOTE: both writers open their file in append mode, so running this section
# again adds the same rows a second time.
for page in range (1,10):
    html=get_html(page)
    res= parse_players(html)
    to_csv(res)
# Club ranking: pages 1-4 are fetched, parsed and appended to clubs.csv.
for page in range (1,5):
    html=get_club_html(page)
    res= parse_teams(html)
    to_clubcsv(res)
# Load the scraped CSV files. They are written without a header row, so the
# column names are supplied explicitly.
# Player file: one row of statistics per player.
df = pd.read_csv(
    'results.csv',
    sep=',',
    names=["name", "position", "age", "club", "matches",
           "goals", "assists", "subOn", "subOff", "value"],
)
# Club file: the country of each club.
clubs = pd.read_csv("clubs.csv", sep=',', names=["club", "country"])
# Attach the country to every player via the club name (left join keeps
# players whose club is absent from the club file), then order by market
# value, highest first, with a fresh 0..n-1 index.
data = (
    df.merge(clubs, how='left', on='club')
    .sort_values("value", ascending=False)
    .reset_index(drop=True)
)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 225 entries, 0 to 224 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 225 non-null object 1 position 225 non-null object 2 age 225 non-null int64 3 club 225 non-null object 4 matches 225 non-null int64 5 goals 225 non-null int64 6 assists 225 non-null int64 7 subOn 225 non-null int64 8 subOff 225 non-null int64 9 value 225 non-null float64 10 country 200 non-null object dtypes: float64(1), int64(6), object(4) memory usage: 19.5+ KB
# Show the players whose country is NaN after the left merge with the club file.
data[data.country.isna()]
name | position | age | club | matches | goals | assists | subOn | subOff | value | country | |
---|---|---|---|---|---|---|---|---|---|---|---|
63 | Bruno Guimarães | Defensive Midfield | 25 | Newcastle United | 35 | 4 | 5 | 3 | 11 | 60.0 | NaN |
73 | Moisés Caicedo | Defensive Midfield | 21 | Brighton & Hove Albion | 40 | 1 | 1 | 2 | 5 | 55.0 | NaN |
84 | Ivan Toney | Centre-Forward | 27 | Brentford FC | 35 | 21 | 5 | 2 | 4 | 50.0 | NaN |
87 | Alexander Isak | Centre-Forward | 23 | Newcastle United | 29 | 11 | 3 | 9 | 17 | 50.0 | NaN |
113 | Sven Botman | Centre-Back | 23 | Newcastle United | 39 | 0 | 0 | 3 | 2 | 45.0 | NaN |
116 | Amadou Onana | Defensive Midfield | 21 | Everton FC | 34 | 1 | 1 | 3 | 13 | 42.0 | NaN |
117 | Alexis Mac Allister | Central Midfield | 24 | Brighton & Hove Albion | 39 | 11 | 3 | 4 | 9 | 42.0 | NaN |
142 | Anthony Gordon | Left Winger | 22 | Newcastle United | 31 | 3 | 1 | 14 | 11 | 40.0 | NaN |
147 | James Ward-Prowse | Central Midfield | 28 | Southampton FC | 42 | 9 | 3 | 1 | 1 | 38.0 | NaN |
150 | Joelinton | Attacking Midfield | 26 | Newcastle United | 37 | 8 | 3 | 3 | 3 | 38.0 | NaN |
154 | Miguel Almirón | Right Winger | 29 | Newcastle United | 39 | 11 | 3 | 6 | 26 | 35.0 | NaN |
156 | João Palhinha | Defensive Midfield | 27 | Fulham FC | 41 | 4 | 1 | 4 | 8 | 35.0 | NaN |
165 | Allan Saint-Maximin | Left Winger | 26 | Newcastle United | 26 | 1 | 5 | 13 | 10 | 35.0 | NaN |
181 | Marc Guéhi | Centre-Back | 22 | Crystal Palace | 38 | 1 | 0 | 1 | 1 | 35.0 | NaN |
188 | Douglas Luiz | Central Midfield | 24 | Aston Villa | 36 | 5 | 5 | 4 | 8 | 35.0 | NaN |
195 | Joachim Andersen | Centre-Back | 26 | Crystal Palace | 35 | 1 | 0 | 0 | 3 | 32.0 | NaN |
197 | Ollie Watkins | Centre-Forward | 27 | Aston Villa | 36 | 15 | 6 | 2 | 9 | 32.0 | NaN |
201 | Jacob Ramsey | Central Midfield | 21 | Aston Villa | 38 | 5 | 6 | 8 | 18 | 32.0 | NaN |
203 | Robert Sánchez | Goalkeeper | 25 | Brighton & Hove Albion | 26 | 0 | 0 | 0 | 1 | 32.0 | NaN |
207 | Brennan Johnson | Attacking Midfield | 21 | Nottingham Forest | 45 | 12 | 3 | 6 | 14 | 30.0 | NaN |
208 | Eberechi Eze | Attacking Midfield | 24 | Crystal Palace | 36 | 8 | 3 | 10 | 15 | 30.0 | NaN |
215 | Oihan Sancet | Attacking Midfield | 23 | Athletic Bilbao | 37 | 8 | 2 | 4 | 28 | 30.0 | NaN |
216 | Gabri Veiga | Attacking Midfield | 20 | Celta de Vigo | 36 | 9 | 4 | 10 | 22 | 30.0 | NaN |
218 | Morgan Gibbs-White | Attacking Midfield | 23 | Nottingham Forest | 38 | 5 | 7 | 3 | 9 | 30.0 | NaN |
222 | Cheick Doucouré | Defensive Midfield | 23 | Crystal Palace | 33 | 0 | 3 | 0 | 18 | 30.0 | NaN |
# Fill the missing countries.
# The Spanish clubs are matched by name rather than by hard-coded row labels:
# row positions depend on the sort order, so fixed labels can point at the
# wrong players and leave the Spanish ones to be filled as 'England'.
spanish_clubs = ['Athletic Bilbao', 'Celta de Vigo']
missing_country = data['country'].isna()
data.loc[missing_country & data['club'].isin(spanish_clubs), 'country'] = 'Spain'
# Every other club without a country in the listing above is English. Only the
# 'country' column is filled, so a NaN in any other column is never replaced
# by the string 'England'.
data['country'] = data['country'].fillna('England')
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 225 entries, 0 to 224 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 225 non-null object 1 position 225 non-null object 2 age 225 non-null int64 3 club 225 non-null object 4 matches 225 non-null int64 5 goals 225 non-null int64 6 assists 225 non-null int64 7 subOn 225 non-null int64 8 subOff 225 non-null int64 9 value 225 non-null float64 10 country 225 non-null object dtypes: float64(1), int64(6), object(4) memory usage: 19.5+ KB
# All distinct positions present in the data:
set(data['position'])
{'Attacking Midfield', 'Central Midfield', 'Centre-Back', 'Centre-Forward', 'Defensive Midfield', 'Goalkeeper', 'Left Winger', 'Left-Back', 'Right Winger', 'Right-Back', 'Second Striker'}
# Keep only the attacking positions.
attacking_positions = ['Attacking Midfield',
                       'Centre-Forward',
                       'Left Winger',
                       'Right Winger',
                       'Second Striker']
# .copy() makes the filtered frame independent of the original, so adding a
# column below does not raise pandas' SettingWithCopyWarning.
data = data[data['position'].isin(attacking_positions)].copy()
# New variable: goal contribution ratio, a measure of a player's contribution
# to their team's goals. It is the number of goals plus assists divided by the
# number of matches played.
# NOTE(review): a player with matches == 0 would get inf/NaN here -- confirm
# that cannot occur in the scraped data.
data['goalContributionRatio'] = (data['goals'] + data['assists']) / data['matches']
data.head(5)
name | position | age | club | matches | goals | assists | subOn | subOff | value | country | goalContributionRatio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Kylian Mbappé | Centre-Forward | 24 | Paris Saint-Germain | 45 | 39 | 10 | 4 | 10 | 180.0 | France | 1.088889 |
1 | Erling Haaland | Centre-Forward | 22 | Manchester City | 51 | 57 | 9 | 2 | 23 | 170.0 | England | 1.294118 |
2 | Vinicius Junior | Left Winger | 22 | Real Madrid | 52 | 22 | 20 | 2 | 18 | 120.0 | Spain | 0.807692 |
4 | Bukayo Saka | Right Winger | 21 | Arsenal FC | 52 | 15 | 13 | 9 | 20 | 110.0 | England | 0.538462 |
5 | Jamal Musiala | Attacking Midfield | 20 | Bayern Munich | 49 | 15 | 14 | 13 | 30 | 110.0 | Germany | 0.591837 |
# Categorical overview: players per country (left) and share per club (right).
plt.figure(figsize=(20, 10))

# Left panel: horizontal count plot, most frequent country first.
country_order = data['country'].value_counts().index
plt.subplot(1, 2, 1)
plt1 = sns.countplot(data=data, y='country', order=country_order)
plt.title('Number of Players playing at each league', fontsize=24)

# Right panel: pie chart of club frequencies, labelled with whole percentages.
club_counts = data['club'].value_counts()
plt.subplot(1, 2, 2)
plt2 = plt.pie(club_counts, labels=club_counts.index, autopct='%.0f%%')
plt.title('Clubs of top valued players', fontsize=24)

plt.tight_layout()
plt.show()
# Box plots of market value, side by side: by position (left) and by country (right).
plt.figure(figsize=(20,10))
# Left panel: value distribution per position.
plt.subplot(1,2,1)
plt1=sns.boxplot(x='position', y='value', data=data)
plt.title('Distribution of players value by position',fontsize=24)
# Right panel: value distribution per country (titled "league").
plt.subplot(1,2,2)
plt2=sns.boxplot(x='country', y='value', data=data)
plt.title('Distribution of players value by league',fontsize=24)
plt.tight_layout()
plt.show()
# 'position' and 'club' are not used beyond this point.
data.drop(['position', 'club'], axis=1, inplace=True)
# Visualize the correlation matrix.
df = data.copy()
# Encode the country names as integer codes so the column can take part in
# the correlation matrix and the regression below.
labelencoder = LabelEncoder()
df['country'] = labelencoder.fit_transform(df['country'])
fig, ax = plt.subplots(figsize=(20, 20))
# df still holds the string 'name' column; correlate the numeric columns only,
# because DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, linewidths=.1, cmap="RdBu");
sns.pairplot(df);
# Target: market value. Features: every remaining column except the player
# name and the target itself.
y = df.value
X = df.drop(['name', 'value'], axis=1)
# A fixed random_state makes the split, and so the reported metrics, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
pipeline = Pipeline([('std_scalar', StandardScaler())])
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)
reg = LinearRegression()
reg.fit(X_train, y_train)
LinearRegression()
from sklearn import metrics
from sklearn.model_selection import cross_val_score
def cross_val(model):
    """Return the mean 10-fold cross-validation score of ``model``.

    The score is computed on the module-level ``X`` and ``y``.
    """
    return cross_val_score(model, X, y, cv=10).mean()
def print_evaluate(true, predicted):
    """Print MAE, MSE, RMSE and R2 for a set of predictions.

    Args:
        true: Observed target values.
        predicted: Model predictions for the same samples.
    """
    # All metrics are computed before anything is printed.
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    report = (
        ('MAE:', mae),
        ('MSE:', mse),
        ('RMSE:', np.sqrt(mse)),  # RMSE is the square root of the MSE
        ('R2 Square', metrics.r2_score(true, predicted)),
    )
    for label, score in report:
        print(label, score)
    print('--------------------------------')
def evaluate(true, predicted):
    """Return ``(mae, mse, rmse, r2)`` for a set of predictions.

    Args:
        true: Observed target values.
        predicted: Model predictions for the same samples.
    """
    mse = metrics.mean_squared_error(true, predicted)
    return (
        metrics.mean_absolute_error(true, predicted),
        mse,
        np.sqrt(mse),  # RMSE is the square root of the MSE
        metrics.r2_score(true, predicted),
    )
# Predict on both splits; only the test-set predictions are evaluated here.
test_pred = reg.predict(X_test)
train_pred = reg.predict(X_train)
print('Model Evaluation:\n--------------------------------')
print_evaluate(y_test, test_pred)
Model Evaluation: -------------------------------- MAE: 18.869821333901264 MSE: 504.7084958543172 RMSE: 22.465718235888147 R2 Square 0.4997574197805914 --------------------------------
# Predicted vs. actual values on the test set, with a fitted regression line.
sns.regplot(x=y_test, y=test_pred,line_kws={'lw':1,'color': '#FF4500' ,'linestyle':'-.'}, marker="o");
# Residual plot. x and y are passed by keyword: positional use is deprecated
# and is rejected from seaborn 0.12 on.
sns.residplot(x=y_test, y=test_pred);
C:\Users\youss\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
# Distribution of the test-set residuals (actual minus predicted) with a KDE overlay.
sns.displot(y_test-test_pred, kde=True);