import requests
import json
import geopandas as gpd
import pandas as pd
import contextily as ctx
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

plt.style.use('ggplot')


r  = requests.get('https://controllerdata.lacity.org/resource/g9h8-fvhu.json?$limit=10000000')
payroll = pd.DataFrame(json.loads(r.content))
payroll2022 = payroll[payroll['pay_year']=='2022']
payroll2022


pay_cols = ['regular_pay', 'overtime_pay', 'all_other_pay', 'total_pay', 'city_retirement_contributions', 'benefit_pay']

# Convert columns to numeric
payroll2022[pay_cols] = payroll2022[pay_cols].apply(pd.to_numeric)
payroll2022.dtypes

/var/folders/c7/zpty205x53s48zvfyzq_xx740000gn/T/ipykernel_49987/1172670025.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  payroll2022[pay_cols] = payroll2022[pay_cols].apply(pd.to_numeric)

record_nbr                        object
pay_year                          object
last_name                         object
first_name                        object
fms_dept                          object
fms_department_title              object
job_class_pgrade                  object
job_title                         object
employment_type                   object
job_status                        object
mou                               object
mou_title                         object
regular_pay                      float64
overtime_pay                     float64
all_other_pay                    float64
total_pay                        float64
city_retirement_contributions    float64
benefit_pay                      float64
gender                            object
ethnicity                         object
dtype: object


aggregations = {
    'last_name': 'last',       
    'first_name': 'last',  
    'fms_dept': 'last', 
    'fms_department_title':'last',
    'job_class_pgrade':'last',
    'job_title':'last',
    'employment_type':'last',
    'job_status':'last',
    'mou':'last',
    'mou_title':'last',         
    'regular_pay': 'sum',
    'overtime_pay': 'sum',
    'all_other_pay': 'sum',
    'total_pay':'sum',
    'city_retirement_contributions':'sum',
    'benefit_pay':'sum',
    'gender':'last',
    'ethnicity':'last'
}

# Group by Unique ID and apply aggregations
merged2022 = payroll2022.groupby('record_nbr').agg(aggregations).reset_index()

merged2022


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

# Specify the categorical columns
categorical_columns = ['fms_department_title', 'job_class_pgrade', 'job_title', 'employment_type', 'job_status', 'mou_title', 'gender', 'ethnicity']

# Specify the target variable
yvar = 'total_pay'

# Drop rows with missing values in the selected columns
df_to_fit = merged2022[categorical_columns + [yvar]].dropna()

# Apply label encoding to categorical columns
label_encoder = LabelEncoder()
for col in categorical_columns:
    df_to_fit[col] = label_encoder.fit_transform(df_to_fit[col])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_to_fit[categorical_columns], df_to_fit[yvar], test_size=0.25, random_state=1)

# RandomForestRegressor training and prediction
rf = RandomForestRegressor(n_estimators=50, random_state=1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Check the sizes of the training and testing sets
print(len(X_train), len(y_train))
print(len(X_test), len(y_test))

45624 45624
15208 15208


import seaborn as sns
import numpy

importances = rf.feature_importances_

# convert to a series, and give the index labels from our X_train dataframe
forest_importances = pd.Series(importances, index=X_train.columns)

# get the standard deviations to be able to plot the error bars
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)

# sort the importances in descending order
forest_importances.sort_values(inplace=True, ascending=False)

# plot
fig, ax = plt.subplots(figsize=(4,15))
sns.barplot(x=forest_importances.values[:10], y=forest_importances.index[:10],yerr=std[:10], ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")

Text(0, 0.5, 'Mean decrease in impurity')


gender_fulltime = merged2022[merged2022['employment_type']=='FULL_TIME']
genderft = gender_fulltime.groupby(['gender'])['record_nbr'].count().reset_index()

gender_partime = merged2022[merged2022['employment_type']=='PART_TIME']
genderpt = gender_partime.groupby(['gender'])['record_nbr'].count().reset_index()

fig = make_subplots(rows=1, cols=2, subplot_titles=['Full Time', 'Part Time'])

fig.add_trace(go.Bar(x=genderft['gender'], y=genderft['record_nbr'], name='Full Time', hovertemplate='Gender: %{x} <br>Number of Employees: %{y:,.0f}', text=genderft['record_nbr'], textposition='outside'), row=1, col=1)
fig.add_trace(go.Bar(x=genderpt['gender'], y=genderpt['record_nbr'], name='Part Time', hovertemplate='Gender: %{x} <br>Number of Employees: %{y:,.0f}', text=genderpt['record_nbr'], textposition='outside'), row=1, col=2)

fig.update_layout(
    title='Number of Employees by Gender',
    xaxis_title='Gender Breakdown',
    yaxis_title='Number of Employees'
)


gender_fulltime = merged2022[merged2022['employment_type']=='FULL_TIME']
genderft = gender_fulltime.groupby(['gender'])['total_pay'].mean().reset_index().round(2)

gender_partime = merged2022[merged2022['employment_type']=='PART_TIME']
genderpt = gender_partime.groupby(['gender'])['total_pay'].mean().reset_index().round(2)

fig = make_subplots(rows=1, cols=2, subplot_titles=['Full Time', 'Part Time'])

fig.add_trace(go.Bar(x=genderft['gender'], y=genderft['total_pay'], name='Full Time', hovertemplate='Gender: %{x} <br>Average Pay: $%{y:,.0f}'), row=1, col=1)
fig.add_trace(go.Bar(x=genderpt['gender'], y=genderpt['total_pay'], name='Part Time', hovertemplate='Gender: %{x} <br>Average Pay: $%{y:,.0f}'), row=1, col=2)

fig.update_layout(
    title='Average Total Pay Comparison by Gender',
    xaxis_title='Gender Breakdown',
    yaxis_title='Total Pay ($)'
)


top100 = merged2022.sort_values('total_pay', ascending=False).iloc[:100]
top100viz = top100.groupby(['gender'])['record_nbr'].count().reset_index()

fig = px.pie(top100viz, values='record_nbr', names='gender', title='2022 Gender Breakdown of City Workforce (Top 100 Earning Employees)', labels={'record_nbr':'Number of Employees'})
fig.show()


eth_fulltime = merged2022[merged2022['employment_type']=='FULL_TIME']
ethft = eth_fulltime.groupby(['ethnicity'])['record_nbr'].count().reset_index()

fig = px.pie(ethft, values='record_nbr', names='ethnicity', title='2022 Ethnic Breakdown of City Workforce (Full-Time)', labels={'record_nbr':'Number of Employees'})
fig.show()


eth_fulltime = merged2022[merged2022['employment_type']=='FULL_TIME']
ethft = eth_fulltime.groupby(['ethnicity'])['total_pay'].mean().reset_index().round(0).sort_values('total_pay',ascending=True)

fig = px.bar(ethft, x='ethnicity', y='total_pay')

# Customize the hover text
fig.update_traces(hovertemplate='Ethnicity: %{x} <br>Average Pay: $%{y:,.2f}')

# Update the layout if needed
fig.update_layout(
    title='Average Total Pay by Ethnicity (Full-Time)',
    xaxis_title='Ethnicity',
    yaxis_title='Total Pay'
)

# Show the plot
fig.show()


eth_parttime = merged2022[merged2022['employment_type']=='PART_TIME']
ethpt = eth_parttime.groupby(['ethnicity'])['record_nbr'].count().reset_index()

fig = px.pie(ethpt, values='record_nbr', names='ethnicity', title='2022 Ethnic Breakdown of City Workforce (Part-Time)', labels={'record_nbr':'Number of Employees'})
fig.show()


eth_parttime = merged2022[merged2022['employment_type']=='PART_TIME']
ethpt = eth_parttime.groupby(['ethnicity'])['total_pay'].mean().reset_index().round(0).sort_values('total_pay',ascending=True)

fig = px.bar(ethpt, x='ethnicity', y='total_pay')

# Customize the hover text
fig.update_traces(hovertemplate='Ethnicity: %{x} <br>Average Pay: $%{y:,.2f}')

# Update the layout if needed
fig.update_layout(
    title='Average Total Pay by Ethnicity (Part-Time)',
    xaxis_title='Ethnicity',
    yaxis_title='Total Pay'
)

# Show the plot
fig.show()


top100 = merged2022.sort_values('total_pay', ascending=False).iloc[:100]
top100viz = top100.groupby(['ethnicity'])['record_nbr'].count().reset_index()

fig = px.pie(top100viz, values='record_nbr', names='ethnicity', title='2022 Ethnic Breakdown of City Workforce (Top 100 Earning Employees)', labels={'record_nbr':'Number of Employees'})
fig.show()

	record_nbr	pay_year	last_name	first_name	fms_dept	fms_department_title	job_class_pgrade	job_title	employment_type	job_status	mou	mou_title	regular_pay	overtime_pay	all_other_pay	total_pay	city_retirement_contributions	benefit_pay	gender	ethnicity
301000	303033353533	2022	OJEDA MACHADO	JESSICA	88	RECREATION AND PARKS	2423-1	AQUATIC FACILITY MANAGER I	FULL_TIME	ACTIVE	20	SUPERVISORY ADMINISTRATIVE	2656.0	0.0	0.0	2656.0	787.77	339.85	FEMALE	HISPANIC
301001	303034333035	2022	HERNANDEZ	JESSE	82	PUBLIC WORKS - SANITATION	4110-1	WASTEWATER COLLECTION WORKER I	FULL_TIME	ACTIVE	4	EQUIPMENT OPERATION AND LABOR	4358.4	0.0	0.0	4358.4	1292.7	0.0	MALE	NaN
301002	303034393631	2022	JENKINS	TERRI	38	FIRE	1360-0	OFFICE SERVICES ASSISTANT	FULL_TIME	ACTIVE	3	CLERICAL	7716.0	0.0	922.77	8638.77	2288.57	1415.04	FEMALE	BLACK
301003	343035353733	2022	DUNN	CASEY	38	FIRE	2112-3	FIREFIGHTER III	FULL_TIME	ACTIVE	23	FIREFIGHTERS AND FIRE CAPTAINS	114409.91	71911.9	9121.4	195443.21	53601.04	18245.7	MALE	CAUCASIAN
301004	343035353833	2022	NAKAMURA	ROSS	70	POLICE	3711-5	EQUIPMENT MECHANIC	FULL_TIME	ACTIVE	14	SERVICE AND CRAFT	93537.86	1177.22	8173.33	102888.41	27743.33	7999.2	MALE	ASIAN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
416993	333733373532	2022	WILLIAMS	MATTHEW	98	WATER AND POWER	3987-5	WTRWKS MCHC SUPV	FULL_TIME	ACTIVE	B	SUPERVISORY BLUE COLLAR UNIT	121781.84	35789.67	22771.45	180342.96	8038	23807.53	MALE	CAUCASIAN
416994	3332353733	2022	JONES	ANTHONY	98	WATER AND POWER	3987-5	WTRWKS MCHC SUPV	FULL_TIME	ACTIVE	B	SUPERVISORY BLUE COLLAR UNIT	134592.80	10657.20	8987.03	154237.03	8883	25998.89	MALE	BLACK
416995	373630323832	2022	HUNTER	JOHN	98	WATER AND POWER	3987-5	WTRWKS MCHC SUPV	FULL_TIME	ACTIVE	B	SUPERVISORY BLUE COLLAR UNIT	138112.01	33485.53	9987.40	181584.94	9115	48630.41	MALE	CAUCASIAN
416996	3730323332	2022	HASKETT	TORAKICHI	98	WATER AND POWER	3987-5	WTRWKS MCHC SUPV	FULL_TIME	ACTIVE	B	SUPERVISORY BLUE COLLAR UNIT	137612.33	24458.18	11381.88	173452.39	9082	22766.49	MALE	CAUCASIAN
416997	313737303733	2022	RODRIGUEZ	JAVIER	98	WATER AND POWER	2358-5	X-RAY & LAB TECHNICIAN	FULL_TIME	ACTIVE	2	TECHNICAL REPRESENTATION UNIT	83158.40	0	3511.64	86670.04	0	25998.89	MALE	HISPANIC

	record_nbr	last_name	first_name	fms_dept	fms_department_title	job_class_pgrade	job_title	employment_type	job_status	mou	mou_title	regular_pay	overtime_pay	all_other_pay	total_pay	city_retirement_contributions	benefit_pay	gender	ethnicity
0	303030303034	ARRAZCAETA	CHRISTOPHE	88	RECREATION AND PARKS	2498-0	RECREATION ASSISTANT	PART_TIME	ACTIVE	7	RECREATION ASSISTANTS	0.00	0.00	1000.00	1000.00	0.00	0.00	MALE	OTHER
1	3030303032	VILLEGAS	JOSE	44	LIBRARY	1172-1	LIBRARY ASSISTANT I	FULL_TIME	ACTIVE	3	CLERICAL	75849.73	0.00	5504.45	81354.18	22497.03	8490.24	MALE	HISPANIC
2	303030303233	ESCOBEDO	JOSE	70	POLICE	2214-2	POLICE OFFICER II	FULL_TIME	NOT_ACTIVE	24	POLICE OFFICERS, LIEUTENANT AND BELOW	119886.53	28064.72	5072.50	153023.75	56166.84	20489.01	MALE	HISPANIC
3	3030303035	BAUTISTA	RONALD	88	RECREATION AND PARKS	3141-0	GARDENER CARETAKER	FULL_TIME	ACTIVE	4	EQUIPMENT OPERATION AND LABOR	61992.88	45.90	3126.18	65164.96	18387.09	17559.12	MALE	HISPANIC
4	3030303036	WOO	MICHELLE	98	WATER AND POWER	9105-5	UTILITY ADMINISTRATOR	FULL_TIME	ACTIVE	M	MANAGEMENT EMPLOYEES UNIT	173304.00	17952.90	11679.85	202936.75	11438.00	10209.85	FEMALE	ASIAN AMERICAN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
62402	CTNA987578	SARABIA	ALFREDO	98	WATER AND POWER	995-1	CONSTR ELTL MCHC	FULL_TIME	NOT_ACTIVE	Z	DAILY RATE	76577.80	110.10	42019.71	118707.61	0.00	0.00	MALE	HISPANIC
62403	CTNA989600	GILLETT	EBONI	98	WATER AND POWER	3181-2	SECTY OFCR	FULL_TIME	NOT_ACTIVE	0	SECURITY UNIT	21020.69	4899.50	1167.12	27087.31	0.00	6010.24	FEMALE	BLACK
62404	CTNA992617	COMMINEY	HEROD	98	WATER AND POWER	1611-1	MTR RDR	FULL_TIME	NOT_ACTIVE	8	OPERATING MAINTENANCE AND SERVICE UNIT	15312.00	949.03	20.00	16281.03	0.00	4044.64	MALE	BLACK
62405	CTNA994155	ZARATE	LAWRENCE	98	WATER AND POWER	1110-1	UTILITY PRE CRAFT TRAINEE	FULL_TIME	NOT_ACTIVE	Z	DAILY RATE	17356.29	15.13	17669.34	35040.76	0.00	0.00	MALE	HISPANIC
62406	CTNA996518	RODRIGUEZ VEJAR	JUAN	98	WATER AND POWER	3344-5	CRPNTR	FULL_TIME	NOT_ACTIVE	8	OPERATING MAINTENANCE AND SERVICE UNIT	3990.40	1446.52	30.00	5466.92	0.00	0.00	MALE	HISPANIC

Analysis of City of Los Angeles Employee Payrolls (2022 Payroll Data)¶

Table of Contents:¶

Part I: Importing Libraries and Datasets ¶

Part II: Data Management and Cleanup ¶

Part III: Using ML to Predict What Influences Total Pay¶

Part IV: Analysis by Gender ¶

Gender Breakdown of City's Top 100 Earning Workers¶

Part V: Analysis by Ethnicity ¶

Full-Time Workers¶

Part-Time Workers¶

Ethnic Breakdown of City's Top 100 Earning Workers¶

Part VI: Recommendations & Conclusions ¶