In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
mean_absolute_error, # MAE
mean_squared_error # MSE
)
In [2]:
flights = pd.read_csv('flights.csv')
/tmp/ipykernel_19417/3005660769.py:1: DtypeWarning: Columns (7,8) have mixed types. Specify dtype option on import or set low_memory=False.
flights = pd.read_csv('flights.csv')
In [3]:
flights.columns
Out[3]:
Index(['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT',
'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY'],
dtype='object')
In [4]:
flights.drop('WEATHER_DELAY', axis=1, inplace=True)
flights.drop('AIRLINE', axis=1, inplace=True)
flights.drop('FLIGHT_NUMBER', axis=1, inplace=True)
flights.drop('DIVERTED', axis=1, inplace=True)
flights.drop('CANCELLED', axis=1, inplace=True)
flights.drop('CANCELLATION_REASON', axis=1, inplace=True)
In [5]:
flights.drop('YEAR', axis=1, inplace=True)
flights.drop('MONTH', axis=1, inplace=True)
flights.drop('DAY', axis=1, inplace=True)
In [6]:
flights.drop('DAY_OF_WEEK', axis=1, inplace=True)
In [7]:
flights.drop('ORIGIN_AIRPORT', axis=1, inplace=True)
flights.drop('DESTINATION_AIRPORT', axis=1, inplace=True)
flights.drop('WHEELS_ON', axis=1, inplace=True)
flights.drop('WHEELS_OFF', axis=1, inplace=True)
In [8]:
flights.columns
Out[8]:
Index(['TAIL_NUMBER', 'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME',
'DEPARTURE_DELAY', 'TAXI_OUT', 'SCHEDULED_TIME', 'ELAPSED_TIME',
'AIR_TIME', 'DISTANCE', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
'ARRIVAL_DELAY', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
'LATE_AIRCRAFT_DELAY'],
dtype='object')
In [9]:
flights.drop('TAXI_IN', axis=1, inplace=True)
flights.drop('TAXI_OUT', axis=1, inplace=True)
flights.drop('AIR_SYSTEM_DELAY', axis=1, inplace=True)
flights.drop('SECURITY_DELAY', axis=1, inplace=True)
flights.drop('AIRLINE_DELAY', axis=1, inplace=True)
flights.drop('LATE_AIRCRAFT_DELAY', axis=1, inplace=True)
In [10]:
flights.columns
Out[10]:
Index(['TAIL_NUMBER', 'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME',
'DEPARTURE_DELAY', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME',
'DISTANCE', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY'],
dtype='object')
In [11]:
flights.head()
Out[11]:
| TAIL_NUMBER | SCHEDULED_DEPARTURE | DEPARTURE_TIME | DEPARTURE_DELAY | SCHEDULED_TIME | ELAPSED_TIME | AIR_TIME | DISTANCE | SCHEDULED_ARRIVAL | ARRIVAL_TIME | ARRIVAL_DELAY | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | N407AS | 5 | 2354.0 | -11.0 | 205.0 | 194.0 | 169.0 | 1448 | 430 | 408.0 | -22.0 |
| 1 | N3KUAA | 10 | 2.0 | -8.0 | 280.0 | 279.0 | 263.0 | 2330 | 750 | 741.0 | -9.0 |
| 2 | N171US | 20 | 18.0 | -2.0 | 286.0 | 293.0 | 266.0 | 2296 | 806 | 811.0 | 5.0 |
| 3 | N3HYAA | 20 | 15.0 | -5.0 | 285.0 | 281.0 | 258.0 | 2342 | 805 | 756.0 | -9.0 |
| 4 | N527AS | 25 | 24.0 | -1.0 | 235.0 | 215.0 | 199.0 | 1448 | 320 | 259.0 | -21.0 |
In [12]:
flights.describe()
Out[12]:
| SCHEDULED_DEPARTURE | DEPARTURE_TIME | DEPARTURE_DELAY | SCHEDULED_TIME | ELAPSED_TIME | AIR_TIME | DISTANCE | SCHEDULED_ARRIVAL | ARRIVAL_TIME | ARRIVAL_DELAY | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 5.819079e+06 | 5.732926e+06 | 5.732926e+06 | 5.819073e+06 | 5.714008e+06 | 5.714008e+06 | 5.819079e+06 | 5.819079e+06 | 5.726566e+06 | 5.714008e+06 |
| mean | 1.329602e+03 | 1.335204e+03 | 9.370158e+00 | 1.416859e+02 | 1.370062e+02 | 1.135116e+02 | 8.223565e+02 | 1.493808e+03 | 1.476491e+03 | 4.407057e+00 |
| std | 4.837518e+02 | 4.964233e+02 | 3.708094e+01 | 7.521058e+01 | 7.421107e+01 | 7.223082e+01 | 6.077843e+02 | 5.071647e+02 | 5.263197e+02 | 3.927130e+01 |
| min | 1.000000e+00 | 1.000000e+00 | -8.200000e+01 | 1.800000e+01 | 1.400000e+01 | 7.000000e+00 | 2.100000e+01 | 1.000000e+00 | 1.000000e+00 | -8.700000e+01 |
| 25% | 9.170000e+02 | 9.210000e+02 | -5.000000e+00 | 8.500000e+01 | 8.200000e+01 | 6.000000e+01 | 3.730000e+02 | 1.110000e+03 | 1.059000e+03 | -1.300000e+01 |
| 50% | 1.325000e+03 | 1.330000e+03 | -2.000000e+00 | 1.230000e+02 | 1.180000e+02 | 9.400000e+01 | 6.470000e+02 | 1.520000e+03 | 1.512000e+03 | -5.000000e+00 |
| 75% | 1.730000e+03 | 1.740000e+03 | 7.000000e+00 | 1.730000e+02 | 1.680000e+02 | 1.440000e+02 | 1.062000e+03 | 1.918000e+03 | 1.917000e+03 | 8.000000e+00 |
| max | 2.359000e+03 | 2.400000e+03 | 1.988000e+03 | 7.180000e+02 | 7.660000e+02 | 6.900000e+02 | 4.983000e+03 | 2.400000e+03 | 2.400000e+03 | 1.971000e+03 |
In [13]:
flights.isnull().sum()
Out[13]:
TAIL_NUMBER 14721 SCHEDULED_DEPARTURE 0 DEPARTURE_TIME 86153 DEPARTURE_DELAY 86153 SCHEDULED_TIME 6 ELAPSED_TIME 105071 AIR_TIME 105071 DISTANCE 0 SCHEDULED_ARRIVAL 0 ARRIVAL_TIME 92513 ARRIVAL_DELAY 105071 dtype: int64
In [14]:
flights['DEPARTURE_TIME'].value_counts()
Out[14]:
DEPARTURE_TIME
555.0 14829
556.0 13976
557.0 13616
558.0 12954
655.0 12391
...
353.0 6
428.0 6
438.0 4
433.0 3
403.0 3
Name: count, Length: 1440, dtype: int64
In [15]:
flights['DEPARTURE_DELAY'].value_counts()
Out[15]:
DEPARTURE_DELAY
-3.0 455407
-4.0 444053
-5.0 438844
-2.0 435237
-1.0 387475
...
683.0 1
1283.0 1
775.0 1
1314.0 1
1433.0 1
Name: count, Length: 1217, dtype: int64
In [16]:
flights['SCHEDULED_TIME'].value_counts()
Out[16]:
SCHEDULED_TIME
85.0 115062
80.0 112856
75.0 105978
90.0 101926
70.0 96823
...
547.0 1
530.0 1
600.0 1
27.0 1
584.0 1
Name: count, Length: 550, dtype: int64
In [17]:
flights['ELAPSED_TIME'].value_counts()
Out[17]:
ELAPSED_TIME
80.0 47441
79.0 47049
81.0 46966
82.0 46679
78.0 46287
...
735.0 1
726.0 1
719.0 1
709.0 1
697.0 1
Name: count, Length: 712, dtype: int64
In [18]:
flights['AIR_TIME'].value_counts()
Out[18]:
AIR_TIME
64.0 49791
63.0 49760
62.0 49476
65.0 49393
61.0 49215
...
669.0 1
684.0 1
676.0 1
674.0 1
672.0 1
Name: count, Length: 675, dtype: int64
In [19]:
flights['ARRIVAL_TIME'].value_counts()
Out[19]:
ARRIVAL_TIME
1645.0 6490
2051.0 6485
2053.0 6485
2050.0 6482
2056.0 6477
...
406.0 51
409.0 51
332.0 51
316.0 49
339.0 48
Name: count, Length: 1440, dtype: int64
In [20]:
flights['ARRIVAL_DELAY'].value_counts()
Out[20]:
ARRIVAL_DELAY
-8.0 176899
-9.0 176016
-10.0 175232
-7.0 174524
-11.0 171557
...
1528.0 1
1368.0 1
996.0 1
1105.0 1
877.0 1
Name: count, Length: 1240, dtype: int64
In [21]:
flights.columns
Out[21]:
Index(['TAIL_NUMBER', 'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME',
'DEPARTURE_DELAY', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME',
'DISTANCE', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY'],
dtype='object')
In [22]:
null_deptime = flights['DEPARTURE_TIME'].isnull().sum()
In [23]:
len(flights['DEPARTURE_TIME'])
Out[23]:
5819079
In [24]:
null_delay = flights['DEPARTURE_DELAY'].isnull().sum()
In [25]:
null_schedtime = flights['SCHEDULED_TIME'].isnull().sum()
In [26]:
null_elapstime = flights['ELAPSED_TIME'].isnull().sum()
In [27]:
null_airtime = flights['AIR_TIME'].isnull().sum()
In [28]:
null_arrtime = flights['ARRIVAL_TIME'].isnull().sum()
In [29]:
null_arrdelay = flights['ARRIVAL_DELAY'].isnull().sum()
In [30]:
dep_time = null_deptime / len(flights['DEPARTURE_TIME']) * 100
In [31]:
dep_delay = null_delay / len(flights['DEPARTURE_DELAY']) * 100
In [32]:
sched_time = null_schedtime / len(flights['SCHEDULED_TIME']) * 100
In [33]:
elapsed_time = null_elapstime / len(flights['ELAPSED_TIME']) * 100
In [34]:
air_time = null_airtime / len(flights['AIR_TIME']) * 100
In [35]:
arrival_time = null_arrtime / len(flights['ARRIVAL_TIME']) * 100
In [36]:
arrival_delay = null_arrdelay / len(flights['ARRIVAL_DELAY']) * 100
In [37]:
full_len = len(flights)
In [38]:
full_len
Out[38]:
5819079
In [39]:
data = [dep_time, dep_delay, sched_time, elapsed_time, air_time, arrival_time, arrival_delay]
In [40]:
df = pd.DataFrame([data], columns=['Czas odlotu','Opóźnienie odlotu', 'Zaplanowany czas', 'Czas który upłynął',
'Czas lotu', 'Czas przylotu', 'Opóźnienie przylotu'])
In [41]:
plt.style.use('bmh')
In [42]:
df.plot(kind='bar', figsize=(11, 3), title='Wartość procentowa pustych wartości poszczególnych kolumn.')
Out[42]:
<AxesSubplot:title={'center':'Wartość procentowa pustych wartości poszczególnych kolumn.'}>
In [43]:
arrival_delay = flights['ARRIVAL_DELAY']
In [44]:
max_hours_delay = arrival_delay.max() / 60
In [45]:
max_hours_delay
Out[45]:
32.85
In [46]:
min_delay = arrival_delay.min() / 60
In [47]:
min_delay
Out[47]:
-1.45
In [48]:
data2 = [max_hours_delay, min_delay]
In [49]:
df2 = pd.DataFrame([data2], columns=['Maksymalne opóźnienie', 'Minimalne opóźnienie'])
In [50]:
df2.plot(kind='bar', figsize=(6, 3), title='Maksymalne i minimalne opóźnienie w godzinach.')
Out[50]:
<AxesSubplot:title={'center':'Maksymalne i minimalne opóźnienie w godzinach.'}>
In [51]:
flights.dropna(inplace=True)
In [52]:
flights.isnull().sum()
Out[52]:
TAIL_NUMBER 0 SCHEDULED_DEPARTURE 0 DEPARTURE_TIME 0 DEPARTURE_DELAY 0 SCHEDULED_TIME 0 ELAPSED_TIME 0 AIR_TIME 0 DISTANCE 0 SCHEDULED_ARRIVAL 0 ARRIVAL_TIME 0 ARRIVAL_DELAY 0 dtype: int64
In [53]:
flights.head()
Out[53]:
| TAIL_NUMBER | SCHEDULED_DEPARTURE | DEPARTURE_TIME | DEPARTURE_DELAY | SCHEDULED_TIME | ELAPSED_TIME | AIR_TIME | DISTANCE | SCHEDULED_ARRIVAL | ARRIVAL_TIME | ARRIVAL_DELAY | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | N407AS | 5 | 2354.0 | -11.0 | 205.0 | 194.0 | 169.0 | 1448 | 430 | 408.0 | -22.0 |
| 1 | N3KUAA | 10 | 2.0 | -8.0 | 280.0 | 279.0 | 263.0 | 2330 | 750 | 741.0 | -9.0 |
| 2 | N171US | 20 | 18.0 | -2.0 | 286.0 | 293.0 | 266.0 | 2296 | 806 | 811.0 | 5.0 |
| 3 | N3HYAA | 20 | 15.0 | -5.0 | 285.0 | 281.0 | 258.0 | 2342 | 805 | 756.0 | -9.0 |
| 4 | N527AS | 25 | 24.0 | -1.0 | 235.0 | 215.0 | 199.0 | 1448 | 320 | 259.0 | -21.0 |
In [54]:
flights.drop('TAIL_NUMBER',axis=1, inplace=True)
In [55]:
flights.head()
Out[55]:
| SCHEDULED_DEPARTURE | DEPARTURE_TIME | DEPARTURE_DELAY | SCHEDULED_TIME | ELAPSED_TIME | AIR_TIME | DISTANCE | SCHEDULED_ARRIVAL | ARRIVAL_TIME | ARRIVAL_DELAY | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5 | 2354.0 | -11.0 | 205.0 | 194.0 | 169.0 | 1448 | 430 | 408.0 | -22.0 |
| 1 | 10 | 2.0 | -8.0 | 280.0 | 279.0 | 263.0 | 2330 | 750 | 741.0 | -9.0 |
| 2 | 20 | 18.0 | -2.0 | 286.0 | 293.0 | 266.0 | 2296 | 806 | 811.0 | 5.0 |
| 3 | 20 | 15.0 | -5.0 | 285.0 | 281.0 | 258.0 | 2342 | 805 | 756.0 | -9.0 |
| 4 | 25 | 24.0 | -1.0 | 235.0 | 215.0 | 199.0 | 1448 | 320 | 259.0 | -21.0 |
In [56]:
flights.describe()
Out[56]:
| SCHEDULED_DEPARTURE | DEPARTURE_TIME | DEPARTURE_DELAY | SCHEDULED_TIME | ELAPSED_TIME | AIR_TIME | DISTANCE | SCHEDULED_ARRIVAL | ARRIVAL_TIME | ARRIVAL_DELAY | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 5.714008e+06 | 5.714008e+06 | 5.714008e+06 | 5.714008e+06 | 5.714008e+06 | 5.714008e+06 | 5.714008e+06 | 5.714008e+06 | 5.714008e+06 | 5.714008e+06 |
| mean | 1.328907e+03 | 1.335066e+03 | 9.294842e+00 | 1.418940e+02 | 1.370062e+02 | 1.135116e+02 | 8.244569e+02 | 1.493187e+03 | 1.476373e+03 | 4.407057e+00 |
| std | 4.835251e+02 | 4.964198e+02 | 3.688972e+01 | 7.531400e+01 | 7.421107e+01 | 7.223082e+01 | 6.086620e+02 | 5.069011e+02 | 5.259730e+02 | 3.927130e+01 |
| min | 1.000000e+00 | 1.000000e+00 | -8.200000e+01 | 1.800000e+01 | 1.400000e+01 | 7.000000e+00 | 3.100000e+01 | 1.000000e+00 | 1.000000e+00 | -8.700000e+01 |
| 25% | 9.160000e+02 | 9.210000e+02 | -5.000000e+00 | 8.500000e+01 | 8.200000e+01 | 6.000000e+01 | 3.730000e+02 | 1.110000e+03 | 1.058000e+03 | -1.300000e+01 |
| 50% | 1.325000e+03 | 1.330000e+03 | -2.000000e+00 | 1.230000e+02 | 1.180000e+02 | 9.400000e+01 | 6.500000e+02 | 1.520000e+03 | 1.512000e+03 | -5.000000e+00 |
| 75% | 1.730000e+03 | 1.740000e+03 | 7.000000e+00 | 1.740000e+02 | 1.680000e+02 | 1.440000e+02 | 1.065000e+03 | 1.917000e+03 | 1.916000e+03 | 8.000000e+00 |
| max | 2.359000e+03 | 2.400000e+03 | 1.988000e+03 | 7.180000e+02 | 7.660000e+02 | 6.900000e+02 | 4.983000e+03 | 2.400000e+03 | 2.400000e+03 | 1.971000e+03 |
In [57]:
import seaborn as sns
In [58]:
sched_dep = flights['SCHEDULED_DEPARTURE']
dep_time = flights['DEPARTURE_TIME']
dep_delay = flights['DEPARTURE_DELAY']
sched_time = flights['SCHEDULED_TIME']
elaps_time = flights['ELAPSED_TIME']
air_time = flights['AIR_TIME']
distance = flights['DISTANCE']
sched_arr = flights['SCHEDULED_ARRIVAL']
arr_time = flights['ARRIVAL_TIME']
arr_delay = flights['ARRIVAL_DELAY']
In [59]:
def variance(*values):
mean = sum(values) / len(values)
_variance = sum((v - mean) ** 2 for v in values) / len(values)
return _variance
In [60]:
variance(*arr_delay)
Out[60]:
1542.2345055354997
In [61]:
from math import sqrt
In [62]:
sqrt(1542.2345055354997)
Out[62]:
39.27129365752419
In [63]:
# Wartość wariancji dla poszczególnych kolumn
sched_dep_v = round(233796.44066774342, 4)
dep_time_v = round(246432.59147815884, 4)
dep_delay_v = round(1360.8514780264186, 4)
sched_time_v = round(5672.1972662605685, 4)
elaps_time_v = round(5507.282198783736, 4)
air_time_v = round(5217.290678876618, 4)
distance_v = round(370469.35273315175, 4)
sched_arr_v = round(256948.70146531807, 4)
arr_time_v = round(276647.565786137, 4)
arr_delay_v = round(1542.2345055354997, 4)
# Wartość odchylenia standardowego dla poszczególnych kolumn
sched_dep_sd = round(483.52501555529, 4)
dep_time_sd = round(496.41977345605284, 4)
dep_delay_sd = round(36.88972049265782, 4)
sched_time_sd = round(75.3139911720297, 4)
elaps_time_sd = round(74.21106520448103, 4)
air_time_sd = round(72.23081529981935, 4)
distance_sd = round(608.6619363268511, 4)
sched_arr_sd = round(506.9010766069826, 4)
arr_time_sd = round(525.9729705851214, 4)
arr_delay_sd = round(39.27129365752419, 4)
In [64]:
variance = [sched_dep_v, dep_time_v, dep_delay_v, sched_time_v, elaps_time_v, air_time_v, distance_v,
sched_arr_v, arr_time_v, arr_delay_v]
stand_dev = [sched_dep_sd, dep_time_sd, dep_delay_sd, sched_time_sd, elaps_time_sd, air_time_sd, distance_sd,
sched_arr_sd, arr_time_sd, arr_delay_sd]
df = pd.DataFrame([stand_dev], columns=['Planowany odlot', 'Czas odlotu', 'Opóźnienie odlotu', 'Planowany czas', 'Czas trwania lotu',
'Czas w powietrzu', 'Dystans', 'Planowany przylot', 'Czas przylotu', 'Opóźnienie przylotu'])
df.plot(kind='bar', figsize=(9, 4), title='Odchylenie standardowe.')
Out[64]:
<AxesSubplot:title={'center':'Odchylenie standardowe.'}>
In [65]:
from scipy import stats
In [66]:
# Współczynnik korelacji Pearsona pomiędzy Czasem odlotu a opóźnieniem odlotu.
departure_correlation = stats.pearsonr(dep_time, dep_delay)
departure_correlation
Out[66]:
PearsonRResult(statistic=0.17183955756177358, pvalue=0.0)
In [67]:
# Współczynnik korelacji Pearsona pomiędzy CZasem przylotu a opóźnieniem przylotu.
arrival_correlation = stats.pearsonr(arr_time, arr_delay)
arrival_correlation
Out[67]:
PearsonRResult(statistic=0.04987622833779577, pvalue=0.0)
In [68]:
# Współczynnik korelacji Pearsona pomiędzy Czasem lotu a dystansem.
air_time_dist = stats.pearsonr(air_time, distance)
air_time_dist
Out[68]:
PearsonRResult(statistic=0.9856434807477208, pvalue=0.0)
In [69]:
df
Out[69]:
| Planowany odlot | Czas odlotu | Opóźnienie odlotu | Planowany czas | Czas trwania lotu | Czas w powietrzu | Dystans | Planowany przylot | Czas przylotu | Opóźnienie przylotu | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 483.525 | 496.4198 | 36.8897 | 75.314 | 74.2111 | 72.2308 | 608.6619 | 506.9011 | 525.973 | 39.2713 |
In [70]:
# from ydata_profiling import ProfileReport
In [71]:
# profile = ProfileReport(flights, title='Profiling report.')
In [72]:
# profile.to_widgets()
In [73]:
# Rozkład zmiennej celu
res = stats.normaltest(arr_delay)
res
Out[73]:
NormaltestResult(statistic=7091359.885168821, pvalue=0.0)
In [74]:
# Test Kołogomorowa-Smirnowa
stats.kstest(arr_delay, stats.norm.cdf)
Out[74]:
KstestResult(statistic=0.5668584230275926, pvalue=0.0, statistic_location=-2.0, statistic_sign=1)
In [75]:
flights['ARRIVAL_DELAY'].describe()
Out[75]:
count 5.714008e+06 mean 4.407057e+00 std 3.927130e+01 min -8.700000e+01 25% -1.300000e+01 50% -5.000000e+00 75% 8.000000e+00 max 1.971000e+03 Name: ARRIVAL_DELAY, dtype: float64
In [76]:
sns.heatmap(round(flights.corr(), 2), annot=True, linewidths=0.5)
Out[76]:
<AxesSubplot:>
In [77]:
df = pd.DataFrame(flights, columns=['ARRIVAL_DELAY', 'DEPARTURE_DELAY']).sample(n=100000, random_state=1, replace=True)
In [78]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
In [79]:
X = df.values[:, :-1]
Y = df.values[:, -1]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)
model = LinearRegression().fit(X_train, Y_train)
model.intercept_
model.coef_
y = model.score(X_train, Y_train)
z = model.score(X_test, Y_test)
y, z
Out[79]:
(0.8906115017107972, 0.8833381210519872)
In [80]:
y_pred = model.predict(X_test)
In [81]:
plt.scatter(X_test, Y_test, color='b', label='Actual Data')
plt.plot(X_test, y_pred, color='r', label='Regression Line')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.legend()
plt.title('Linear Regression')
plt.show()
In [82]:
df = pd.DataFrame(flights, columns=['DEPARTURE_TIME', 'DEPARTURE_DELAY', 'SCHEDULED_TIME', 'ARRIVAL_TIME', 'ARRIVAL_DELAY'])
In [83]:
df
Out[83]:
| DEPARTURE_TIME | DEPARTURE_DELAY | SCHEDULED_TIME | ARRIVAL_TIME | ARRIVAL_DELAY | |
|---|---|---|---|---|---|
| 0 | 2354.0 | -11.0 | 205.0 | 408.0 | -22.0 |
| 1 | 2.0 | -8.0 | 280.0 | 741.0 | -9.0 |
| 2 | 18.0 | -2.0 | 286.0 | 811.0 | 5.0 |
| 3 | 15.0 | -5.0 | 285.0 | 756.0 | -9.0 |
| 4 | 24.0 | -1.0 | 235.0 | 259.0 | -21.0 |
| ... | ... | ... | ... | ... | ... |
| 5819074 | 2355.0 | -4.0 | 320.0 | 753.0 | -26.0 |
| 5819075 | 2355.0 | -4.0 | 227.0 | 430.0 | -16.0 |
| 5819076 | 2350.0 | -9.0 | 221.0 | 432.0 | -8.0 |
| 5819077 | 2353.0 | -6.0 | 161.0 | 330.0 | -10.0 |
| 5819078 | 14.0 | 15.0 | 221.0 | 442.0 | 2.0 |
5714008 rows × 5 columns
In [84]:
X = df.values[:, :-1]
Y = df.values[:, -1]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)
model = LinearRegression().fit(X_train, Y_train)
model.intercept_ # wyraz wolny
model.coef_ # współczynnik nachylenia (slope)
y = model.score(X_train, Y_train)
z = model.score(X_test, Y_test)
y, z
Out[84]:
(0.896243087041473, 0.8944133208242555)
In [85]:
y_prediction = model.predict(X_train)
print('MSE on train data = ' , metrics.mean_squared_error(Y_train, y_prediction))
MSE on train data = 160.96995674625435