import pandas as pd

# Load the raw player data and take a first look at its contents and
# available columns before doing any preprocessing.
nba_data = pd.read_csv('./player_data.csv')
for preview in (nba_data, nba_data.columns):
    print(preview)
In the above code we simply load the player data and list the columns associated with the data set. The output shows only the head and the tail of the data. The information is difficult to read and therefore must be preprocessed before it is analyzed. We can also see the immense length of the data set: there are 4,550 rows of information. We must therefore limit the number of rows being analyzed, as processing all 4,550 would require too much time and energy.
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Load a manageable subset of the player data and drop the columns that
# are not needed for the height/weight analysis.
nba_data = pd.read_csv('./player_data.csv', nrows=15)
nba_data.drop(['birth_date', 'college'], axis=1, inplace=True)
nba_data.dropna(inplace=True)

# Encode the categorical 'position' column as one-hot indicator columns.
# sparse_output= and get_feature_names_out are the current sklearn API
# (the older sparse= / get_feature_names were removed in sklearn 1.2).
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
onehot = enc.fit_transform(nba_data[['position']])
# get_feature_names_out already prefixes each value with 'position_',
# so no extra prefix is added here.
cols = list(enc.get_feature_names_out(['position']))
# Build the indicator frame with nba_data's own index so the assignment
# aligns row-by-row even though dropna() may have left gaps in the index;
# a default RangeIndex here would misalign and produce NaNs.
nba_data[cols] = pd.DataFrame(onehot, columns=cols, index=nba_data.index)
nba_data.drop(['position'], axis=1, inplace=True)


def _height_to_inches(height):
    """Convert a 'feet-inches' string such as '6-10' to total inches."""
    feet, inches = height.split('-')
    return int(feet) * 12 + int(inches)


nba_data['height'] = nba_data['height'].apply(_height_to_inches)

# Show the full preprocessed frame without pandas truncation.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(nba_data)

# Calculate and report the average height (inches) and weight (pounds).
avg_height = nba_data['height'].mean()
avg_weight = nba_data['weight'].mean()
print(f"Average height: {round(avg_height, 2)} inches")
print(f"Average weight: {round(avg_weight, 2)} pounds")
In the above code I was able to process the information and drop two of the columns to shorten the data. I was also able to encode the categorical position variable in order to work with the data more easily. Finally, I converted the height column to inches so that it could be compared with the weight column (already in pounds) and so that I could take the averages. I found that when working on this data set, iterating through all 4,550 rows was impractical. To fix this I set a limit of the first 15 rows.
import pandas as pd

# Load the season statistics and apply basic cleaning: drop the stray
# unnamed index column, zero-fill missing values, and make the year an
# integer.
season_data = (
    pd.read_csv('./Seasons_Stats.csv')
    .drop('Unnamed: 0', axis=1)
    .fillna(0)
)
season_data['Year'] = season_data['Year'].astype(int)

# Show the cleaned frame and its column names.
print(season_data)
print(season_data.columns)
The above code loads the season data from 1951 to the present. Similar to the previous data set, we can see that we have an extraordinary number of rows and therefore must process the data in order to understand and analyze it.
import pandas as pd

pd.set_option('display.max_rows', None)


def per_game_leaders(df, stat, label, top=10):
    """Return the top `top` player-seasons ranked by `stat` per game.

    df:    season stats with 'Player', 'Pos', 'Tm', 'Year', 'G' and
           the `stat` column.
    stat:  name of the counting-stat column to total (e.g. 'PTS').
    label: name for the computed per-game column (e.g. 'PPG').
    top:   number of leading rows to return.

    Rows whose summed games played is zero are excluded: fillna(0) on
    the raw data can introduce G == 0 rows, and dividing by them would
    put spurious inf values at the top of the leaderboard.
    """
    totals = (
        df[['Player', 'Pos', 'Tm', 'Year', stat, 'G']]
        .groupby(['Player', 'Pos', 'Tm', 'Year'])
        .sum()
        .reset_index()
    )
    totals = totals[totals['G'] > 0]
    totals[label] = round(totals[stat] / totals['G'], 1)
    totals = totals.sort_values(by=[label], ascending=False)
    return totals[['Player', 'Pos', 'Tm', 'Year', label]].head(top)


if __name__ == "__main__":
    # Load and clean the season data.
    season_data = pd.read_csv('./Seasons_Stats.csv')
    season_data = season_data.drop('Unnamed: 0', axis=1)
    season_data = season_data.fillna(0)
    season_data['Year'] = season_data['Year'].astype(int)

    # Print the top-10 leaders for each per-game category; the first
    # heading has no leading newline, the rest do (matches prior output).
    print("Top 10 Leaders in PPG:")
    print(per_game_leaders(season_data, 'PTS', 'PPG'))
    for stat, label in [('STL', 'SPG'), ('AST', 'APG'),
                        ('BLK', 'BPG'), ('TRB', 'RPG')]:
        print(f"\nTop 10 Leaders in {label}:")
        print(per_game_leaders(season_data, stat, label))
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load and clean the season data.
season_data = pd.read_csv('./Seasons_Stats.csv')
season_data = season_data.drop('Unnamed: 0', axis=1)
season_data = season_data.fillna(0)
season_data['Year'] = season_data['Year'].astype(int)

# Keep only the columns relevant to the correlation analysis.
df = season_data[['Year', 'PTS', 'AST', 'TRB']]

# Compute and display the pairwise correlations (this was previously
# computed but never shown, which made the analysis invisible).
corr = df.corr()
print(corr)

# Linear regression plot of assists against points.
sns.lmplot(x='PTS', y='AST', data=df)
plt.title('Points vs. Assists')
plt.xlabel('Points')
plt.ylabel('Assists')
plt.show()
I tried to graph Assists against Points to see the correlation between great scorers and great team players. However, this does not help much, as the graph is simply a clump of data points. I would like to learn more about how I can analyze the data and create a more informative graph.