NCAA - Simple Rating System

This is calculated based on formula from https://www.pro-football-reference.com/.
Details can be found here.

This is method 2 from https://github.com/eyalshafran/NCAA-Kaggle/blob/master/Teams%20Strength.ipynb

In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import glob
import numpy as np
from sklearn.linear_model import Ridge,LogisticRegression,Lasso
from sklearn.metrics import log_loss
import seaborn as sns
pd.options.display.max_rows = 200

%matplotlib inline

Load Data

In [3]:
files = glob.glob('google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MDataFiles_Stage1/*')
file_dict = {f.split("\\")[-1].split(".")[0]:f for i,f in enumerate(files)}

Season Stats

Load team file, regular season gamelogs and tournament gamelogs

In [5]:
teams = pd.read_csv(file_dict['MTeams'],usecols=['TeamID','TeamName'])
SeasonResults = pd.read_csv(file_dict['MRegularSeasonCompactResults'])
In [6]:
SeasonResultsWin = SeasonResults[['Season','DayNum','WTeamID','WScore','LTeamID','LScore','WLoc']].copy()
SeasonResultsWin.columns = ['Season','DayNum','TeamID','Score','TeamID_OPP','Score_OPP','WLoc']
SeasonResultsWin['result'] = 1
SeasonResultsLose = SeasonResults[['Season','DayNum','WTeamID','WScore','LTeamID','LScore','WLoc']].copy()
SeasonResultsLose.columns = ['Season','DayNum','TeamID_OPP','Score_OPP','TeamID','Score','WLoc']
SeasonResultsLose['result'] = 0
SeasonResultsTeams = pd.concat([SeasonResultsWin,SeasonResultsLose],axis=0,sort=True)

SeasonResultsTeams['WL'] = np.where(SeasonResultsTeams['Score']-SeasonResultsTeams['Score_OPP']>0,'W','L')

Tournament Stats

In [7]:
TourneyCompactResults = pd.read_csv(file_dict['MNCAATourneyCompactResults'])
TourneyCompactResults['TeamID1'] = np.minimum(TourneyCompactResults['WTeamID'],TourneyCompactResults['LTeamID'])
TourneyCompactResults['TeamID2'] = np.maximum(TourneyCompactResults['WTeamID'],TourneyCompactResults['LTeamID'])
TourneyCompactResults['result'] = np.where(TourneyCompactResults['WTeamID']==TourneyCompactResults['TeamID1'],1,0)
TourneyCompactResults['ID'] = TourneyCompactResults['Season'].astype(str)+ '_' +TourneyCompactResults['TeamID1'].astype(str)+ '_' +TourneyCompactResults['TeamID2'].astype(str)

Create SRS Calculations Function

In [8]:
def SOS(data,seasons,alpha=0.0008,reg_type='Lasso',zscore=True,days = 133):
    """Find the true team strength by adjusting for opponent and average margin of victory. 
    Computes the strength based on a single season.
    The regression is based on an m*m matrix where m is the number of teams.
    Input:
    data - pandas data frame with same structure as SeasonResultsTeams
    files
    seasons - a list/array of seasons to compute the strength for
    alpha (default = 0.008) - the regularization parameter
    reg_type (default = Lasso) - 'Lasso', 'Ridge' or 'Logistic'
    days (default = 133) - only includes games with DayNum < days. 
    zscore (default = False) - if True computes the zscore for each season
    Example:
    team_strength = SOS(SeasonResults,np.arange(1985,2020),alpha=0.001,reg_type='Lasso')
    """
    
    if reg_type == 'Lasso':
        lin = Lasso(alpha=alpha);
    elif reg_type == 'Ridge':
        lin = Ridge(alpha=alpha);
    else:
        print('reg_type is not recognized. Using Lasso.')
        lin = Lasso(alpha=alpha);
        
    srs = []
    for season in seasons:
        single_season = data.loc[(data['Season']==season)&(data['DayNum']<=days)].copy()
        team_results =  single_season.groupby(['TeamID','TeamID_OPP']).size().unstack(1).fillna(0)
        team_results = -(team_results.T/team_results.sum(axis=1).T).T

        outcomes = single_season.groupby('TeamID')[['Score','Score_OPP']].mean()
        outcomes['scoreDiff'] = outcomes['Score'] - outcomes['Score_OPP']
        y = outcomes['scoreDiff'].values

        A = team_results.values
        np.fill_diagonal(A, 1)
        lin.fit(A,y)
        srs_per_team = lin.coef_
        single_season_srs = pd.DataFrame(srs_per_team,index=team_results.index,columns=['SRS']).reset_index()
        single_season_srs['MOV'] = y
        single_season_srs['Season'] = season
        single_season_srs['SOS'] = single_season_srs['SRS'] - single_season_srs['MOV']
        if zscore:
            single_season_srs['SRS'] = (single_season_srs['SRS'] - single_season_srs['SRS'].mean())/single_season_srs['SRS'].std()
        srs.append(single_season_srs)
    return pd.concat(srs,ignore_index=True)

Hypertune Regularization Parameter

Using cross validation we are going to see what is the best regularization parameter for the SRS to be predictive in the NCAA tournament

In [9]:
def hypertune_model(reg_type,alphas,zscores):
    # Define predictive model for the trounament with low regularization parameter 
    lr = LogisticRegression(solver='lbfgs',C=1000000,random_state=0,max_iter=1500)

    all_scores = []
    for zscore in zscores:
        for alpha in alphas:  
            # get team strength
            team_strength = SOS(SeasonResultsTeams,
                                 np.arange(1985,2020),
                                 alpha=alpha,
                                 reg_type=reg_type,
                                 zscore=zscore)

            # join to tournament data
            TourneyResults = (TourneyCompactResults
                                     .merge(team_strength,left_on=['Season','TeamID1'],right_on=['Season','TeamID'],how='left')
                                     .drop('TeamID',axis=1)
                                     .merge(team_strength,left_on=['Season','TeamID2'],right_on=['Season','TeamID'],how='left')
                                     .drop('TeamID',axis=1)
                                    ).copy()

            TourneyResults['strength_diff'] = TourneyResults['SRS_x'] - TourneyResults['SRS_y']

            cols = ['strength_diff']

            # define model variables
            X = TourneyResults.loc[:,cols]
            y = TourneyResults[['result']].values.ravel()

            # run cross validation
            scores = np.zeros((5,1))
            for ii,s in enumerate(range(2014,2019)):
                idxTrain = (TourneyResults['Season'] < s) 
                idxTest = (TourneyResults['Season'] == s)
                # fit all models
                lr.fit(X.loc[idxTrain],y[idxTrain])

                ypred_lr = lr.predict_proba(X.loc[idxTest])

                scores[ii,0] = log_loss(y[idxTest],ypred_lr[:,1])

            all_scores.append([zscore,alpha,np.mean(scores),np.std(scores)])

    return pd.DataFrame(all_scores,columns=['zscore','alpha','log_loss_mean','log_loss_std'])   

Ridge Regression

Our best results with method 1 was 0.546306. Let's see if we can beat that.

In [10]:
all_scores_df = hypertune_model('Ridge',[0.001,0.01,0.1,1,10,100],[False,True])
In [11]:
# use this to highlight the minimum value - https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html
def highlight_min(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_min = s == s.min()
    return ['background-color: yellow' if v else '' for v in is_min]
In [12]:
all_scores_df
all_scores_df.style.apply(highlight_min,subset=['log_loss_mean'])
Out[12]:
zscore alpha log_loss_mean log_loss_std
0 False 0.001 0.548127 0.0414718
1 False 0.01 0.546461 0.0410545
2 False 0.1 0.556322 0.0363844
3 False 1 0.643838 0.0239368
4 False 10 0.670334 0.0173919
5 False 100 0.673286 0.0164135
6 True 0.001 0.548489 0.0420861
7 True 0.01 0.546654 0.0414189
8 True 0.1 0.555952 0.036745
9 True 1 0.644054 0.0250121
10 True 10 0.670826 0.0187734
11 True 100 0.673807 0.0178427

Lasso Regression

In [13]:
all_scores_df2 = hypertune_model('Lasso',[0.0005,0.001,0.002,0.004,0.008,0.016],[False,True])
In [14]:
all_scores_df2
all_scores_df2.style.apply(highlight_min,subset=['log_loss_mean'])
Out[14]:
zscore alpha log_loss_mean log_loss_std
0 False 0.0005 0.545271 0.0408231
1 False 0.001 0.544806 0.0403479
2 False 0.002 0.55065 0.0401083
3 False 0.004 0.578154 0.0380917
4 False 0.008 0.623629 0.0426569
5 False 0.016 0.661755 0.0304696
6 True 0.0005 0.545401 0.041393
7 True 0.001 0.544721 0.0409278
8 True 0.002 0.550253 0.0411535
9 True 0.004 0.577596 0.0400231
10 True 0.008 0.624985 0.0467087
11 True 0.016 0.665857 0.0364726

Our previous best result was 0.546306.
We can see that we got the best results for predicting the NCAA tournament using this method!!!

Another thing to notice is that the zscore did seem to help in this case.

Find the strongest team since 1985*

*If we didn't care about the NCAA tournament we wouldn't use regularization. Since we are trying to find the strength of the teams as a predictive feature for the NCAA tournament we are using the regularization we found with the hypertunning.

In [20]:
teams_sos = SOS(SeasonResultsTeams,np.arange(1985,2020),alpha=0.001,zscore=True)
conf = pd.read_csv(file_dict['MTeamConferences'])
teams_sos = (teams_sos
             .merge(teams[['TeamID','TeamName']],on='TeamID',how='left')
             .merge(conf,on=['TeamID','Season'])
             .sort_values(by='SRS',ascending=False)
             .reset_index(drop=True)
            )
teams_sos[['TeamName','Season','SRS','MOV','SOS','ConfAbbrev']].head(10)
Out[20]:
TeamName Season SRS MOV SOS ConfAbbrev
0 Duke 1999 3.684896 25.424242 8.632206 acc
1 Duke 2001 3.328810 20.848485 9.751051 acc
2 UNLV 1991 3.214135 28.933333 2.372293 big_west
3 Duke 1998 3.209445 21.166667 8.539421 acc
4 Kentucky 1996 3.186182 22.200000 6.955198 sec
5 Gonzaga 2019 3.155749 23.787879 4.230072 wcc
6 Oklahoma 1988 3.143722 23.393939 4.872336 big_eight
7 Kentucky 2015 3.134946 20.941176 6.374745 sec
8 North Carolina 1986 3.097695 18.838710 8.157938 acc
9 Kentucky 1997 3.046292 21.363636 6.629357 sec

We can also see each teams Margin Of Victory (MOV) and Strength Of Schedule (SOS). In fact, the SOS is calculated by SRS (before zscore) - MOV.

I'm going to find the mean SOS for each conference so I can sort my box plot by the mean.

In [24]:
strength_by_conf = (teams_sos[teams_sos['Season']==2019]
                      .groupby('ConfAbbrev')[['SOS']]
                      .mean()
                      .sort_values(by='SOS',ascending=False)
                      .reset_index()
                     )

And we can use the seaborn library to make the box plot

In [42]:
sns.set(font_scale=1.5)
fig,ax = plt.subplots(figsize=(8,10))

ax = sns.boxplot(y="ConfAbbrev",
                 x="SOS",
                 data=teams_sos[teams_sos['Season']==2019],
                 orient='h',
                 order=strength_by_conf['ConfAbbrev'].tolist())

ax.set_xlabel('Strength Of Schedule (SOS)',fontsize=16);
ax.set_ylabel('',fontsize=16);
ax.set_title('Strength Of Schedule By Conference (2019)',fontsize=20);
In [44]:
fig.savefig('Simple Rating System.png',bbox_inches='tight')


Comments

comments powered by Disqus