Availability of data from other cities is inconsistent.
Targeting freeway segments within the City of Los Angeles will allow both available datasets to be used since both agencies have jurisdiction there.
Data from the California Highway Patrol was mixed with data from other agencies and was therefore omitted.
#### With Metro Line:
- 110 between Downtown LA and Pasadena - Gold Line
- 10 between Downtown LA and Santa Monica - Expo Line
- 101 between Downtown LA and North Hollywood - Red Line

#### Without Metro Line:
- 405 Sepulveda Pass
- 118 between 405 and Ventura County Line
- 5 between Downtown LA and the 134
# Import Python Libraries that will be used.
import pandas as pd
import geopandas
import numpy as np
import matplotlib as plt
import geocoder
import re
import datetime
%matplotlib inline
DUI Code list is compiled by searching for DMV violation codes (https://www.dmv.ca.gov/portal/dmv/detail/dl/vioptct) as well as references to "Alcohol", "DUI" and "Influence" in charge descriptions.
# DUI-related offense codes (CA Vehicle/Penal/Harbors & Navigation codes).
# Matched verbatim against the offense-code columns below, so formatting
# variants of the same statute (with/without the "VC" suffix) are listed separately.
dui_code_list = ['23152(G)VC', '191.5(B)PC', '23152(C)', '23550.5AVC', '23152(B)', '23152(D)', '23152(F)', '23224(B)VC', '23550(A)VC', '23152(F)VC', '655(B)HN', '23105(A)VC', '23175(A)VC', '23136(A)VC', '23152(A)', '14601.2B.VC', '23152(B)VC', '23153(E)VC', '23153(A)', '21200.5VC', '23221VC', '23152(E)VC', '23153(B)', '191.5(A)PC', '23550VC', '23153(G)VC', '23153(B)VC', '23152(A)VC', '655(B)HNC', '192(C)(3)PC', '14601.2A.VC', '23153(A)VC', '23140(A)VC', '23153(F)VC', '23152(G)']
# Datasets are omitted from the Repo due to size.
# They are available at:
# https://data.lacounty.gov/Public-Safety/LASD-Officer-Contacts-Incident-Details/mwug-nk2r
# https://data.lacounty.gov/Public-Safety/LASD-Officer-Contacts-Person-Details/wraf-ix3i
# Import County Data
person_details = pd.read_csv("./datasets/CountyOfLA/LASD_Officer_Contacts_-_Person_Details.csv")
# Import incident details with parse_dates enabled.
# NOTE(review): infer_datetime_format is deprecated in pandas >= 2.0 — confirm pandas version.
incident_details = pd.read_csv("./datasets/CountyOfLA/LASD_Officer_Contacts_-_Incident_Details.csv", parse_dates = [2], infer_datetime_format = True)
# Person Details contains the offence code
person_details.head()
# Incident Details contains Date and Time
incident_details.head()
# Join based on Contact ID
# merge defaults to an inner join, so only contacts present in both tables survive.
sheriff_data = person_details.merge(incident_details, left_on = 'Contact ID', right_on='CONTACT_ID')
sheriff_data.shape
# The Sheriff's Data contains many more columns than the ones we are interested in.
print(list(sheriff_data.columns))
# Limit Columns to relevant ones
# Keeps identifiers, the timestamp, address components, and the three
# offense-code result columns that find_dui_offence_codes scans later.
sheriff_data_focused = sheriff_data[['Contact ID','Person ID', 'DATE_TIME','Street #', 'Street Direction', 'Street Name',\
'Street Type', 'Suite', 'Cross Street', 'Landmark', 'Full Address', 'CITY', 'STATE', 'ZIP_CODE',\
'Reason for Contact', 'Reason for Contact Narrative', 'Traffic Violation Type',\
'Traffic Violation Offense Code','Result of Contact: In-field Cite and Release?',\
'Result of Contact: In-field Cite and Release Offense Codes?',\
'Result of Contact: Custodial Arrest Pursuant to Outstanding Warrant ?',\
'Result of Contact: Custodial Arrest Without Warrant?', \
'Result of Contact: Custodial Arrest Without Warrant Offense Codes?']]
# Creating empty columns to store date and time as datetime objects.
# NOTE(review): sheriff_data_focused is a column-slice of sheriff_data; these
# assignments may raise SettingWithCopyWarning — confirm that is acceptable here.
sheriff_data_focused["Date"] = np.nan
sheriff_data_focused["Time"] = np.nan
# Functions to separate date and time from timestamp
def populate_date(row):
    """Copy the date portion of the row's DATE_TIME into the "Date" column.

    Fixed: the original wrapped the date string in a list, so pd.to_datetime
    returned a one-element DatetimeIndex which was then stored in a scalar
    cell. Passing the string directly yields a scalar Timestamp.
    """
    timestamp = row["DATE_TIME"]
    # pd.to_datetime on a single "M/D/YYYY" string returns a scalar Timestamp.
    row["Date"] = pd.to_datetime(str(timestamp.month) + "/" + str(timestamp.day) + "/" + str(timestamp.year))
    return row
def populate_time(row):
    """Encode the DATE_TIME clock time as an integer HHMM (13:45 -> 1345)."""
    ts = row["DATE_TIME"]
    row["Time"] = ts.hour * 100 + ts.minute
    return row
sheriff_data_focused = sheriff_data_focused.apply(populate_date, axis = 1)
sheriff_data_focused = sheriff_data_focused.apply(populate_time, axis = 1)
# What is the distribution of all arrests through the day?
# Time is an integer HHMM, so 24 bins roughly approximate one bin per hour.
sheriff_data_focused.hist(column = "Time", bins = 24)
sheriff_data_focused.shape
# Check the date range covered by the Sheriff's data.
sheriff_data_focused["DATE_TIME"].max()
sheriff_data_focused["DATE_TIME"].min()
# Create empty column to store DUI offence code
sheriff_data_focused["Relevant_Offence_Code"] = np.nan
sheriff_data_focused["Result of Contact: Custodial Arrest Without Warrant Offense Codes?"].unique()
# Store copy of first row for function testing
sheriff_data_test = sheriff_data_focused.iloc[0]
def split_combined_codes(string):
    """Return the offense code(s) in *string* as a list.

    Multi-code values are comma-joined; a single code (or any non-string
    such as NaN) comes back as a one-element list of its str() form.
    """
    text = str(string)
    # str.split yields [text] when no comma is present, so the combined
    # and single-code cases collapse into one expression.
    return text.split(",")
def find_dui_offence_codes(row):
    """Scan the three offense-code columns of *row* for a DUI code.

    Populates row["Relevant_Offence_Code"] with the first code found in
    dui_code_list (search order: traffic violation, cite-and-release,
    custodial arrest); leaves the row untouched when none match.
    """
    code_columns = (
        "Traffic Violation Offense Code",
        "Result of Contact: In-field Cite and Release Offense Codes?",
        "Result of Contact: Custodial Arrest Without Warrant Offense Codes?",
    )
    candidates = []
    for column in code_columns:
        candidates.extend(split_combined_codes(row[column]))
    for candidate in candidates:
        if candidate in dui_code_list:
            row["Relevant_Offence_Code"] = candidate
            break
    return row
# Smoke-test the function on the stored sample row.
find_dui_offence_codes(sheriff_data_test)
# Apply functions to the full dataset.
sheriff_data_with_dui_codes = sheriff_data_focused.apply(find_dui_offence_codes, axis = 1)
sheriff_data_with_dui_codes.count()
# Filter to rows that have data in the "Relevant Offence Code" Column
sheriff_data_with_dui_codes = sheriff_data_with_dui_codes[sheriff_data_with_dui_codes.Relevant_Offence_Code.notnull()]
sheriff_data_with_dui_codes.shape
# Drop duplicate entries from the same incident
sheriff_data_with_dui_codes.drop_duplicates(subset="Contact ID", keep='first', inplace=True)
sheriff_data_with_dui_codes.shape
# Reset the helper columns; they are repopulated as string dates below.
sheriff_data_with_dui_codes["Date"] = np.nan
sheriff_data_with_dui_codes["Time"] = np.nan
# Functions to separate date and time from timestamp
def populate_date(row):
    """Store the DATE_TIME date as a "M/D/YYYY" string in the "Date" column."""
    stamp = row["DATE_TIME"]
    row["Date"] = "{}/{}/{}".format(stamp.month, stamp.day, stamp.year)
    return row
def populate_time(row):
    """Store the DATE_TIME clock time as an integer HHMM in the "Time" column."""
    stamp = row["DATE_TIME"]
    hours, minutes = stamp.hour, stamp.minute
    row["Time"] = hours * 100 + minutes
    return row
sheriff_data_with_dui_codes = sheriff_data_with_dui_codes.apply(populate_date, axis = 1)
sheriff_data_with_dui_codes = sheriff_data_with_dui_codes.apply(populate_time, axis = 1)
sheriff_data_with_dui_codes.head()
#sheriff_data_with_dui_codes_test = sheriff_data_with_dui_codes.iloc[[0]]
print(sheriff_data_with_dui_codes["DATE_TIME"].dtype)
# Compare the hourly distribution of DUI arrests against all Sheriff arrests.
sheriff_data_with_dui_codes.hist(column = "Time", bins = 24)
sheriff_data_focused.hist(column = "Time", bins = 24)
# Create columns to store Latitude and Longitude values.
sheriff_data_with_dui_codes["Lat"] = np.nan
sheriff_data_with_dui_codes["Long"] = np.nan
# Latitude and Longitude range for the boundaries of LA County.
# Latitude 33.6 - 34.8
# Longitude -117.6 - -118.9
# Geocoder was returning clearly wrong values for some data, using ranges to confirm that results are reasonable.
def get_lat_long(row, address = "Full Address", city = "CITY", state = "STATE"):
    """Geocode a row's address and populate its "Lat"/"Long" columns.

    Builds a comma-joined query from the address, city, and state fields
    (always inserting "Los Angeles County", which markedly improved ArcGIS
    results), geocodes it, and keeps the coordinates only when they fall
    inside the LA County sanity range. 0.0 in either column means
    not found / out of range.

    Fixes over the original:
    - NaN field values are truthy floats; they previously reached str.join
      and raised TypeError. pd.notnull now guards each component.
    - geocoder returns latlng=None on failure; indexing it previously
      raised TypeError. The result is now checked before subscripting.
    """
    string_list = []
    if pd.notnull(row[address]):
        string_list.append(str(row[address]))
    if pd.notnull(row[city]):
        string_list.append(str(row[city]))
    # Specifying LA County resulted in much better results from the Geocoder
    string_list.append("Los Angeles County")
    if pd.notnull(row[state]):
        string_list.append(str(row[state]))
    full_string = ",".join(string_list)
    g = geocoder.arcgis(full_string)
    # latlng is a [lat, lng] list on success, None on failure.
    lat_lng_list = g.latlng
    # Default values signal "no usable coordinates".
    latitude = 0.0
    longitude = 0.0
    if lat_lng_list:
        # Keep each coordinate only if it is plausibly within LA County.
        if 33.6 <= lat_lng_list[0] <= 35.0:
            latitude = lat_lng_list[0]
        if -119.0 <= lat_lng_list[1] <= -117.6:
            longitude = lat_lng_list[1]
    row["Lat"] = latitude
    row["Long"] = longitude
    return row
# Geocode every remaining Sheriff DUI row (slow: one network call per row).
sheriff_data_with_dui_codes = sheriff_data_with_dui_codes.apply(get_lat_long, axis = 1)
# Look at data using Describe
# Check to be sure the min in Lat and Long show no 0.0 values
sheriff_data_with_dui_codes.describe()
# Pull DUI data into a GeoDataFrame
dui_gdf = geopandas.GeoDataFrame(
sheriff_data_with_dui_codes, geometry=geopandas.points_from_xy(sheriff_data_with_dui_codes.Long, sheriff_data_with_dui_codes.Lat))
# Convert from Geographic Coordinates to projected
dui_gdf.crs = {'init' :'epsg:4326'}
dui_gdf = dui_gdf.to_crs({'init': 'epsg:2229'})
print(dui_gdf.crs)
# Pull county outline from http://boundaries.latimes.com/sets/
county = geopandas.read_file("http://s3-us-west-2.amazonaws.com/boundaries.latimes.com/archive/1.0/boundary-set/counties-2012.geojson")
county = county.to_crs({'init': 'epsg:2229'})
print(county.crs)
# Load city data from local file
city = geopandas.read_file("./datasets/GISData/City Boundaries.geojson")
city = city.to_crs({'init': 'epsg:2229'})
print(city.crs)
dui_gdf.head()
# City data includes non-Land features.
city['feat_type'].unique()
# Limit County Outline to just LA
la_county = county[county["name"] == "Los Angeles"]
# Limit city data to Los Angeles and Land
city_of_los_angeles = city[(city["city_name"] == "Los Angeles") & (city["feat_type"] == "Land")]
#city_of_los_angeles.shape
# Combine multipolygon features into one geometry for comparison
la_boundary = city_of_los_angeles.geometry.unary_union
print(dui_gdf.crs)
# Plot the county and city outlines with the Sheriff DUI points overlaid.
ax = la_county.geometry.boundary.plot(figsize=(20, 20),color=None,edgecolor='k',linewidth = 2)
#la_county.geometry.boundary.plot(color=None,edgecolor='k',linewidth = 2,ax=ax)
city_of_los_angeles.geometry.boundary.plot(color=None,edgecolor='lightblue',linewidth = 2,ax=ax)
dui_gdf.plot(figsize=(20, 20), alpha=0.5, edgecolor='green', ax = ax)
dui_gdf.shape
# Keep only Sheriff DUI points inside the City of Los Angeles boundary.
city_dui_gdf = dui_gdf[dui_gdf["geometry"].within(la_boundary)]
#from geopandas.tools import sjoin
#city_dui_gdf = sjoin(dui_gdf, la_boundary, how='left', op='within')
city_dui_gdf.shape
ax = la_county.geometry.boundary.plot(figsize=(20, 20),color=None,edgecolor='k',linewidth = 2)
#la_county.geometry.boundary.plot(color=None,edgecolor='k',linewidth = 2,ax=ax)
city_of_los_angeles.geometry.boundary.plot(color=None,edgecolor='lightblue',linewidth = 2,ax=ax)
city_dui_gdf.plot(figsize=(20, 20), alpha=0.5, edgecolor='green', ax = ax)
# Index Cleanup to Avoid issues when comparing to other datasets
city_dui_gdf.reset_index(inplace=True)
# Drop duplicates from Contact ID field since there might be multiple people involved in an incident
city_dui_gdf.drop_duplicates(subset="Contact ID", keep='first', inplace=True)
#print(list(city_dui_gdf["Contact ID"]))
sheriff_dui_data_for_merge = city_dui_gdf[['Date', 'Time', 'Relevant_Offence_Code', 'Lat', 'Long', 'geometry']]
# fill np array with desired string. Use length of df to define array length
#len(list(sheriff_dui_data_for_merge['geometry']))
#sheriff_dui_data_for_merge['Source'] = np.full(len(list(sheriff_dui_data_for_merge['geometry'])), "LA_County_Sheriff").tolist()
#source_column = np.full(len(list(sheriff_dui_data_for_merge['geometry'])), "LA_County_Sheriff")
# assign() adds the Source tag column without mutating the sliced frame in place.
sheriff_dui_data_for_merge = sheriff_dui_data_for_merge.assign(Source="LA_County_Sheriff")
sheriff_dui_data_for_merge.head()
# Dataset is omitted from the Repository due to size.
# Public download available at:
# https://data.lacity.org/resource/yru6-6re4.json
# Import LAPD data into Data Frame
# Convert text dates to datetime
# NOTE(review): infer_datetime_format is deprecated in pandas >= 2.0 — confirm pandas version.
lapd_raw_data = pd.read_csv("./datasets/CityOfLA/Arrest_Data_from_2010_to_Present.csv",parse_dates = [1], infer_datetime_format = True)
lapd_raw_data.shape
lapd_raw_data.head()
# Check the date range covered by the LAPD data.
lapd_raw_data["Arrest Date"].max()
lapd_raw_data["Arrest Date"].min()
# Filter data to Charge Codes in dui_code_list
lapd_dui_data = lapd_raw_data[lapd_raw_data["Charge"].isin(dui_code_list)]
lapd_dui_data.shape
#Filter Data to same timeframe as Sheriff's data.
lapd_dui_data_during_sheriff = lapd_dui_data[(lapd_dui_data["Arrest Date"]>=datetime.date(2018,7,1))&(lapd_dui_data["Arrest Date"]<= datetime.date(2018,12,31))]
#df[(df['date']>datetime.date(2016,1,1)) & (df['date']<datetime.date(2016,3,1))]
lapd_dui_data_during_sheriff.shape
# Location Column is storing data as text.
# We need to separate the Latitude and Longitude to work with it.
type(lapd_dui_data["Location"].iloc[0])
# Create new columns to store the values
# NOTE(review): direct assignment on a filtered frame may raise SettingWithCopyWarning.
lapd_dui_data['Lat'] = np.nan
lapd_dui_data['Long'] = np.nan
# Tag rows with state and source for the later combined dataset.
lapd_dui_data = lapd_dui_data.assign(STATE="California")
lapd_dui_data = lapd_dui_data.assign(Source="Los_Angeles_Police_Department")
lapd_dui_data.head()
# Regex to distinguish Latitude from Longitude in the string.
# This Regex will only work for our dataset since it uses the negative Longitude to help distinguish.
# Fixed: the dot was unescaped, so it matched ANY character — e.g. in
# "(34, -118)" the lat pattern matched "34," which later crashes float().
# Raw strings also make the \d escapes explicit.
lat = re.compile(r'\d{2}\.\d*')
long = re.compile(r'-\d*\.\d*')
def combine_address_with_cross(row):
    """Merge a populated Cross Street into the Address field.

    When "Cross Street" is non-null, rewrites row["Address"] to
    "<Address> and <Cross Street>"; otherwise returns the row unchanged.
    """
    cross_street = row["Cross Street"]
    if pd.notnull(cross_street):
        row["Address"] = "{} and {}".format(row["Address"], cross_street)
    return row
def geocode_only_nulls(row):
    """Geocode a row only when its Location is the null sentinel "(0.0, 0.0)".

    Combines the address with the cross street, then delegates to
    get_lat_long using the LAPD column names; rows with a real Location
    pass through untouched.
    """
    if row["Location"] != "(0.0, 0.0)":
        return row
    enriched_row = combine_address_with_cross(row)
    return get_lat_long(enriched_row, address = "Address", city = "Area Name", state = "STATE")
def split_lat_long(row):
    """Extract Lat/Long from the textual Location field via the module regexes.

    Stores the matched substring (as a string) when it falls inside the
    LA County sanity range, the string "No Match" when it does not, and
    leaves the column untouched when the regex finds nothing.
    """
    location_text = row["Location"]
    lat_match = lat.search(location_text)
    if lat_match:
        in_range = 33.6 <= float(lat_match.group()) <= 35.0
        row["Lat"] = lat_match.group() if in_range else "No Match"
    long_match = long.search(location_text)
    if long_match:
        in_range = -119.0 <= float(long_match.group()) <= -117.6
        row["Long"] = long_match.group() if in_range else "No Match"
    return row
lapd_dui_data_with_lat_long = lapd_dui_data.apply(split_lat_long, axis=1)
lapd_dui_data_with_lat_long.head()
#lapd_dui_data_test = lapd_dui_data.iloc[:4]
#print(lapd_dui_data_test)
#lapd_dui_data_with_lat_long = lapd_dui_data_test.apply(split_lat_long, axis=1)
#lapd_dui_data_with_lat_long.head()
# Rows with the "(0.0, 0.0)" sentinel have no usable coordinates yet.
lapd_dui_data_with_null_location = lapd_dui_data_with_lat_long[lapd_dui_data_with_lat_long["Location"] == "(0.0, 0.0)"]
lapd_dui_data_with_null_location.shape
# Use the function created above for this situation.
lapd_dui_data_with_lat_long = lapd_dui_data_with_lat_long.apply(geocode_only_nulls, axis = 1)
# Filter Data Frame to any rows that still have "No Match"
check_for_missing = lapd_dui_data_with_lat_long[(lapd_dui_data_with_lat_long["Lat"] == "No Match") | (lapd_dui_data_with_lat_long["Long"] == "No Match")]
check_for_missing.head()
# Also check for rows the regex/geocoder never populated at all.
check_for_missing = lapd_dui_data_with_lat_long[lapd_dui_data_with_lat_long["Lat"].isnull()]
check_for_missing.head()
def convert_lat_long_to_float(row):
    """Coerce the row's "Lat" and "Long" fields to float.

    Raises ValueError if either field still holds a non-numeric string
    (e.g. "No Match"), which surfaces unhandled coordinates early.
    """
    for column in ("Lat", "Long"):
        row[column] = float(row[column])
    return row
# Make sure Latitude and Longitude values are float(numeric) rather than string(text)
lapd_dui_data_with_lat_long = lapd_dui_data_with_lat_long.apply(convert_lat_long_to_float, axis = 1)
lapd_dui_data_with_lat_long.head()
# Convert dataframe to geodataframe
lapd_dui_gdf = geopandas.GeoDataFrame(
lapd_dui_data_with_lat_long, geometry=geopandas.points_from_xy(lapd_dui_data_with_lat_long.Long, lapd_dui_data_with_lat_long.Lat))
# Convert Lat/Long data to projected coordinates
lapd_dui_gdf.crs = {'init' :'epsg:4326'}
lapd_dui_gdf = lapd_dui_gdf.to_crs({'init': 'epsg:2229'})
# Plot points relative to LA County and City
ax = la_county.geometry.boundary.plot(figsize=(20, 20),color=None,edgecolor='k',linewidth = 2)
city_of_los_angeles.geometry.boundary.plot(color=None,edgecolor='lightblue',linewidth = 2,ax=ax)
lapd_dui_gdf.plot(figsize=(20, 20), alpha=0.5, edgecolor='green', ax = ax)
# Filter points to just those in the City
lapd_dui_gdf_in_city = lapd_dui_gdf[lapd_dui_gdf["geometry"].within(la_boundary)]
lapd_dui_gdf_in_city.shape
# Plot points relative to LA County and City
ax = la_county.geometry.boundary.plot(figsize=(20, 20),color=None,edgecolor='k',linewidth = 2)
city_of_los_angeles.geometry.boundary.plot(color=None,edgecolor='lightblue',linewidth = 2,ax=ax)
lapd_dui_gdf_in_city.plot(figsize=(20, 20), alpha=0.5, edgecolor='green', ax = ax)
la_freeways_and_ramps = geopandas.read_file("./datasets/GISData/LosAngelesFreewaysAndRamps.geojson")
# Simplify geometry with tolerance 50 — presumably feet given the EPSG:2229
# label below; confirm the file's native units.
la_freeways_and_ramps["geometry"] = la_freeways_and_ramps.simplify(50)
la_freeways_and_ramps.shape
la_freeways_and_ramps.head()
la_freeways_and_ramps["Type"].unique()
# NOTE(review): assigning .crs relabels without reprojecting — this assumes
# the file is already in EPSG:2229; verify against the source data.
la_freeways_and_ramps.crs = {'init' :'epsg:2229'}
#{'init' :'epsg:4326'}
ax = la_county.geometry.boundary.plot(figsize=(20, 20),color=None,edgecolor='k',linewidth = 2)
city_of_los_angeles.geometry.boundary.plot(color=None,edgecolor='lightblue',linewidth = 2,ax=ax)
la_freeways_and_ramps.plot(figsize=(20, 20), alpha=0.5, edgecolor='green', ax = ax)
# 150-ft buffer polygons around the freeway segments chosen for analysis.
freeway_segments_buffers = geopandas.read_file("./datasets/GISData/150_ft_buffers.geojson")
#freeway_segments_for_analysis.crs = {'init' :'epsg:4326'}
#freeway_segments_for_analysis = freeway_segments_for_analysis.to_crs({'init' :'epsg:2229'})
freeway_segments_buffers.head(6)
ax = la_county.geometry.boundary.plot(figsize=(20, 20),color=None,edgecolor='k',linewidth = 2)
city_of_los_angeles.geometry.boundary.plot(color=None,edgecolor='lightblue',linewidth = 2,ax=ax)
freeway_segments_buffers.plot(figsize=(20, 20), alpha=0.5, edgecolor='green', ax = ax)
# Spatially join LAPD DUI points to the buffer polygons; how='left' keeps all points.
lapd_duis_in_buffers = geopandas.sjoin(lapd_dui_gdf_in_city, freeway_segments_buffers, how='left', op='within', lsuffix='left', rsuffix='right')
#lapd_duis_in_buffers = lapd_dui_gdf_in_city.sjoin()
#geopandas.sjoin(left_df, right_df, how='inner', op='intersects', lsuffix='left', rsuffix='right')
lapd_duis_in_buffers.shape
# Keep only points that actually fell inside a buffer (join columns non-null).
lapd_duis_in_buffers = lapd_duis_in_buffers[lapd_duis_in_buffers["SourceID"].notnull()]
print(lapd_duis_in_buffers.shape)
lapd_duis_in_buffers.head()
ax = la_county.geometry.boundary.plot(figsize=(20, 20),color=None,edgecolor='k',linewidth = 2)
city_of_los_angeles.geometry.boundary.plot(color=None,edgecolor='lightblue',linewidth = 2,ax=ax)
lapd_duis_in_buffers.plot(figsize=(20, 20), alpha=0.5, edgecolor='green', ax = ax)
lapd_duis_in_buffers.columns
# Convert back to regular DataFrame
lapd_duis_in_buffers_df = pd.DataFrame(lapd_duis_in_buffers.drop(columns='geometry'))
# Use layer field to group points by freeway segment
lapd_counts_by_segment = lapd_duis_in_buffers_df.groupby(['layer']).count()
# Get count of reports for each segment
lapd_counts_by_segment = lapd_counts_by_segment.sort_values(by=['Report ID'])
lapd_counts_by_segment.head(6)
lapd_counts_by_segment["Report ID"].plot(kind='bar')
# Pull traffic count points into GeoDataFrame
traffic_count_points = geopandas.read_file("./datasets/GISData/LosAngelesTrafficCountPoints2017.geojson")
# NOTE(review): relabels CRS without reprojecting — assumes file is already EPSG:2229.
traffic_count_points.crs = {'init' :'epsg:2229'}
traffic_count_points.head()
# Combine traffic counts for each direction into a total count.
traffic_count_points["Total_AADT"] = traffic_count_points["Back_AADT"] + traffic_count_points["Ahead_AADT"]
traffic_count_points.head()
ax = la_county.geometry.boundary.plot(figsize=(20, 20),color=None,edgecolor='k',linewidth = 2)
city_of_los_angeles.geometry.boundary.plot(color=None,edgecolor='lightblue',linewidth = 2,ax=ax)
traffic_count_points.plot(figsize=(20, 20), alpha=0.5, edgecolor='green', ax = ax)
# Join Traffic Count Points to buffers
traffic_counts_in_buffers = geopandas.sjoin(traffic_count_points, freeway_segments_buffers, how='left', op='within', lsuffix='left', rsuffix='right')
traffic_counts_in_buffers = traffic_counts_in_buffers[traffic_counts_in_buffers["layer"].notnull()]
traffic_counts_in_buffers.head()
# Convert back to regular dataframe
# Calculate mean values for traffic count points in each buffer
traffic_counts_in_buffers_df = pd.DataFrame(traffic_counts_in_buffers.drop(columns='geometry'))
traffic_count_by_segment = traffic_counts_in_buffers_df.groupby(['layer']).mean()
traffic_count_by_segment.head()
# Combine DUI counts with average traffic volume per segment.
dui_with_traffic_counts = traffic_count_by_segment.merge(lapd_counts_by_segment, left_on = 'layer', right_on = 'layer')
# Normalize: DUI reports per 1,000 average daily vehicles (AADT).
dui_with_traffic_counts["dui_per_thousand_cars"] = dui_with_traffic_counts["Report ID"]/(dui_with_traffic_counts["Total_AADT"] / 1000)
dui_with_traffic_counts = dui_with_traffic_counts.sort_values(by=['dui_per_thousand_cars'])
dui_with_traffic_counts["dui_per_thousand_cars"]
dui_with_traffic_counts["dui_per_thousand_cars"].plot(kind='bar')