Προαπαιτούμενα
# Remove every object from the current (global) environment before starting.
# NOTE(review): this also wipes the caller's workspace if the script is sourced.
rm(list = ls())

# Attach tidyverse; when it cannot be loaded, install it and attach it again.
if (!require(tidyverse)) {
  install.packages("tidyverse")
  library(tidyverse)
}
Loading required package: tidyverse
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.6
✔ forcats 1.0.1 ✔ stringr 1.6.0
✔ ggplot2 4.0.1 ✔ tibble 3.3.0
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Attach nycflights13 (provides the `flights` table used below); when it
# cannot be loaded, install it and attach it again.
if (!require(nycflights13)) {
  install.packages("nycflights13")
  library(nycflights13)
}
Loading required package: nycflights13
# A tibble: 336,776 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 517 515 2 830 819
2 2013 1 1 533 529 4 850 830
3 2013 1 1 542 540 2 923 850
4 2013 1 1 544 545 -1 1004 1022
5 2013 1 1 554 600 -6 812 837
6 2013 1 1 554 558 -4 740 728
7 2013 1 1 555 600 -5 913 854
8 2013 1 1 557 600 -3 709 723
9 2013 1 1 557 600 -3 838 846
10 2013 1 1 558 600 -2 753 745
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
Επιλογή γραμμών με βάση κάποια κριτήρια (filter)
Απλή
8 Οκτωβρίου
# Keep the rows whose month is 10 and whose day is 8.
filter(flights, month == 10, day == 8)
# A tibble: 964 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 10 8 141 2059 282 254 2242
2 2013 10 8 450 500 -10 633 648
3 2013 10 8 508 517 -9 729 757
4 2013 10 8 540 545 -5 922 933
5 2013 10 8 547 550 -3 916 932
6 2013 10 8 550 545 5 822 827
7 2013 10 8 550 600 -10 647 708
8 2013 10 8 552 600 -8 649 701
9 2013 10 8 553 600 -7 849 856
10 2013 10 8 554 600 -6 844 851
# ℹ 954 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
Στις αρχές του έτους.
# Keep the rows from the first three months (month <= 3).
filter(flights, month <= 3)
# A tibble: 80,789 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 517 515 2 830 819
2 2013 1 1 533 529 4 850 830
3 2013 1 1 542 540 2 923 850
4 2013 1 1 544 545 -1 1004 1022
5 2013 1 1 554 600 -6 812 837
6 2013 1 1 554 558 -4 740 728
7 2013 1 1 555 600 -5 913 854
8 2013 1 1 557 600 -3 709 723
9 2013 1 1 557 600 -3 838 846
10 2013 1 1 558 600 -2 753 745
# ℹ 80,779 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
# Keep the rows whose origin is "EWR".
filter(flights, origin == "EWR")
# A tibble: 120,835 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 517 515 2 830 819
2 2013 1 1 554 558 -4 740 728
3 2013 1 1 555 600 -5 913 854
4 2013 1 1 558 600 -2 923 937
5 2013 1 1 559 600 -1 854 902
6 2013 1 1 601 600 1 844 850
7 2013 1 1 606 610 -4 858 910
8 2013 1 1 607 607 0 858 915
9 2013 1 1 608 600 8 807 735
10 2013 1 1 615 615 0 833 842
# ℹ 120,825 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
Λογικοί τελεστές
Καλοκαίρι
# Summer: months 6 through 8. Comma-separated conditions in filter() are
# combined with a logical AND.
filter(flights, month >= 6, month <= 8)
# A tibble: 86,995 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 6 1 2 2359 3 341 350
2 2013 6 1 451 500 -9 624 640
3 2013 6 1 506 515 -9 715 800
4 2013 6 1 534 545 -11 800 829
5 2013 6 1 538 545 -7 925 922
6 2013 6 1 539 540 -1 832 840
7 2013 6 1 546 600 -14 850 910
8 2013 6 1 551 600 -9 828 850
9 2013 6 1 552 600 -8 647 655
10 2013 6 1 553 600 -7 700 711
# ℹ 86,985 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
Πάσχα
# Easter window: from 29 April up to and including 12 May.
filter(
  flights,
  (month == 4 & day >= 29) | (month == 5 & day <= 12)
)
# A tibble: 13,018 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 4 29 2 2345 17 222 241
2 2013 4 29 15 2359 16 350 339
3 2013 4 29 23 2249 94 117 2359
4 2013 4 29 451 500 -9 634 640
5 2013 4 29 510 515 -5 744 802
6 2013 4 29 534 545 -11 816 828
7 2013 4 29 538 540 -2 834 840
8 2013 4 29 539 545 -6 910 927
9 2013 4 29 550 600 -10 723 805
10 2013 4 29 551 600 -9 853 852
# ℹ 13,008 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
Διάταξη
Διάταξη γραμμών (arrange)
Με βάση την καθυστέρηση σε αύξουσα σειρά.
# Sort the rows by departure delay, smallest first.
arrange(flights, dep_delay)
# A tibble: 336,776 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 12 7 2040 2123 -43 40 2352
2 2013 2 3 2022 2055 -33 2240 2338
3 2013 11 10 1408 1440 -32 1549 1559
4 2013 1 11 1900 1930 -30 2233 2243
5 2013 1 29 1703 1730 -27 1947 1957
6 2013 8 9 729 755 -26 1002 955
7 2013 10 23 1907 1932 -25 2143 2143
8 2013 3 30 2030 2055 -25 2213 2250
9 2013 3 2 1431 1455 -24 1601 1631
10 2013 5 5 934 958 -24 1225 1309
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
Με βάση την καθυστέρηση σε φθίνουσα σειρά.
# Sort the rows by departure delay, largest first.
arrange(flights, desc(dep_delay))
# A tibble: 336,776 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 9 641 900 1301 1242 1530
2 2013 6 15 1432 1935 1137 1607 2120
3 2013 1 10 1121 1635 1126 1239 1810
4 2013 9 20 1139 1845 1014 1457 2210
5 2013 7 22 845 1600 1005 1044 1815
6 2013 4 10 1100 1900 960 1342 2211
7 2013 3 17 2321 810 911 135 1020
8 2013 6 27 959 1900 899 1236 2226
9 2013 7 22 2257 759 898 121 1026
10 2013 12 5 756 1700 896 1058 2020
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
Διάταξη στηλών (select)
Βάζουμε τις στήλες των καθυστερήσεων πρώτες
# Move the two delay columns to the front; everything() keeps all the
# remaining columns after them.
select(flights, dep_delay, arr_delay, everything())
# A tibble: 336,776 × 19
dep_delay arr_delay year month day dep_time sched_dep_time arr_time
<dbl> <dbl> <int> <int> <int> <int> <int> <int>
1 2 11 2013 1 1 517 515 830
2 4 20 2013 1 1 533 529 850
3 2 33 2013 1 1 542 540 923
4 -1 -18 2013 1 1 544 545 1004
5 -6 -25 2013 1 1 554 600 812
6 -4 12 2013 1 1 554 558 740
7 -5 19 2013 1 1 555 600 913
8 -3 -14 2013 1 1 557 600 709
9 -3 -8 2013 1 1 557 600 838
10 -2 8 2013 1 1 558 600 753
# ℹ 336,766 more rows
# ℹ 11 more variables: sched_arr_time <int>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
Επιλογή στηλών (select)
# Keep only the year and month columns.
select(flights, year, month)
# A tibble: 336,776 × 2
year month
<int> <int>
1 2013 1
2 2013 1
3 2013 1
4 2013 1
5 2013 1
6 2013 1
7 2013 1
8 2013 1
9 2013 1
10 2013 1
# ℹ 336,766 more rows
# Keep only the origin and dep_delay columns.
select(flights, origin, dep_delay)
# A tibble: 336,776 × 2
origin dep_delay
<chr> <dbl>
1 EWR 2
2 LGA 4
3 JFK 2
4 JFK -1
5 LGA -6
6 EWR -4
7 EWR -5
8 LGA -3
9 JFK -3
10 LGA -2
# ℹ 336,766 more rows
Επιλογή στηλών από dep_time έως arr_time.
# Keep the contiguous run of columns from dep_time through arr_time.
select(flights, dep_time:arr_time)
# A tibble: 336,776 × 4
dep_time sched_dep_time dep_delay arr_time
<int> <int> <dbl> <int>
1 517 515 2 830
2 533 529 4 850
3 542 540 2 923
4 544 545 -1 1004
5 554 600 -6 812
6 554 558 -4 740
7 555 600 -5 913
8 557 600 -3 709
9 557 600 -3 838
10 558 600 -2 753
# ℹ 336,766 more rows
Επιλογή στηλών εκτός από αυτές μεταξύ των dep_time έως arr_time.
# Drop the columns from dep_time through arr_time and keep all the others.
select(flights, -(dep_time:arr_time))
# A tibble: 336,776 × 15
year month day sched_arr_time arr_delay carrier flight tailnum origin
<int> <int> <int> <int> <dbl> <chr> <int> <chr> <chr>
1 2013 1 1 819 11 UA 1545 N14228 EWR
2 2013 1 1 830 20 UA 1714 N24211 LGA
3 2013 1 1 850 33 AA 1141 N619AA JFK
4 2013 1 1 1022 -18 B6 725 N804JB JFK
5 2013 1 1 837 -25 DL 461 N668DN LGA
6 2013 1 1 728 12 UA 1696 N39463 EWR
7 2013 1 1 854 19 B6 507 N516JB EWR
8 2013 1 1 723 -14 EV 5708 N829AS LGA
9 2013 1 1 846 -8 B6 79 N593JB JFK
10 2013 1 1 745 8 AA 301 N3ALAA LGA
# ℹ 336,766 more rows
# ℹ 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
Νέες μεταβλητές
Προσθήκη νέων μεταβλητών (mutate)
# Narrower working table: the date columns (year through day), every column
# whose name ends in "delay", plus distance and air_time.
flightsShort <- select(
  flights,
  year:day,
  ends_with("delay"),
  distance,
  air_time
)
flightsShort
# A tibble: 336,776 × 7
year month day dep_delay arr_delay distance air_time
<int> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 2013 1 1 2 11 1400 227
2 2013 1 1 4 20 1416 227
3 2013 1 1 2 33 1089 160
4 2013 1 1 -1 -18 1576 183
5 2013 1 1 -6 -25 762 116
6 2013 1 1 -4 12 719 150
7 2013 1 1 -5 19 1065 158
8 2013 1 1 -3 -14 229 53
9 2013 1 1 -3 -8 944 140
10 2013 1 1 -2 8 733 138
# ℹ 336,766 more rows
Προσθήκη της διαφοράς καθυστέρησης άφιξης–αναχώρησης (gain) και της ταχύτητας (speed)
# Append two derived columns to flightsShort:
#   gain  - arrival delay minus departure delay
#   speed - distance divided by air_time, multiplied by 60
mutate(
  flightsShort,
  gain = arr_delay - dep_delay,
  speed = distance / air_time * 60
)
# A tibble: 336,776 × 9
year month day dep_delay arr_delay distance air_time gain speed
<int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2013 1 1 2 11 1400 227 9 370.
2 2013 1 1 4 20 1416 227 16 374.
3 2013 1 1 2 33 1089 160 31 408.
4 2013 1 1 -1 -18 1576 183 -17 517.
5 2013 1 1 -6 -25 762 116 -19 394.
6 2013 1 1 -4 12 719 150 16 288.
7 2013 1 1 -5 19 1065 158 24 404.
8 2013 1 1 -3 -14 229 53 -11 259.
9 2013 1 1 -3 -8 944 140 -5 405.
10 2013 1 1 -2 8 733 138 10 319.
# ℹ 336,766 more rows
Πίνακας μόνο με τις νέες και τις επιλεγμένες μεταβλητές (transmute)
# transmute() keeps only the columns listed here: origin plus the two
# derived columns gain and speed.
transmute(
  flights,
  origin,
  gain = arr_delay - dep_delay,
  speed = distance / air_time * 60
)
# A tibble: 336,776 × 3
origin gain speed
<chr> <dbl> <dbl>
1 EWR 9 370.
2 LGA 16 374.
3 JFK 31 408.
4 JFK -17 517.
5 LGA -19 394.
6 EWR 16 288.
7 EWR 24 404.
8 LGA -11 259.
9 JFK -5 405.
10 LGA 10 319.
# ℹ 336,766 more rows
Ομαδοποίηση (group_by)
Οι πτήσεις χωρισμένες σε ομάδες πτήσεων Ιανουαρίου, Φεβρουαρίου κτλ, ανεξάρτητα από το έτος
# Group the flights by month only (the year is not part of the grouping key).
by_month <- group_by(flights, month)
by_month
# A tibble: 336,776 × 19
# Groups: month [12]
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 517 515 2 830 819
2 2013 1 1 533 529 4 850 830
3 2013 1 1 542 540 2 923 850
4 2013 1 1 544 545 -1 1004 1022
5 2013 1 1 554 600 -6 812 837
6 2013 1 1 554 558 -4 740 728
7 2013 1 1 555 600 -5 913 854
8 2013 1 1 557 600 -3 709 723
9 2013 1 1 557 600 -3 838 846
10 2013 1 1 558 600 -2 753 745
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
Οι πτήσεις χωρισμένες σε ομάδες ανά μήνα (Ιανουάριος, Φεβρουάριος κτλ.) του κάθε έτους
# Group the flights by the (year, month) pair.
by_month2 <- group_by(flights, year, month)
by_month2
# A tibble: 336,776 × 19
# Groups: year, month [12]
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 517 515 2 830 819
2 2013 1 1 533 529 4 850 830
3 2013 1 1 542 540 2 923 850
4 2013 1 1 544 545 -1 1004 1022
5 2013 1 1 554 600 -6 812 837
6 2013 1 1 554 558 -4 740 728
7 2013 1 1 555 600 -5 913 854
8 2013 1 1 557 600 -3 709 723
9 2013 1 1 557 600 -3 838 846
10 2013 1 1 558 600 -2 753 745
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
Οι πτήσεις χωρισμένες σε ομάδες ανά ημέρα.
# Group the flights by calendar day, i.e. by the (year, month, day) triple.
by_day <- group_by(flights, year, month, day)
by_day
# A tibble: 336,776 × 19
# Groups: year, month, day [365]
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 517 515 2 830 819
2 2013 1 1 533 529 4 850 830
3 2013 1 1 542 540 2 923 850
4 2013 1 1 544 545 -1 1004 1022
5 2013 1 1 554 600 -6 812 837
6 2013 1 1 554 558 -4 740 728
7 2013 1 1 555 600 -5 913 854
8 2013 1 1 557 600 -3 709 723
9 2013 1 1 557 600 -3 838 846
10 2013 1 1 558 600 -2 753 745
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
Σύνοψη (summarise)
Μέση καθυστέρηση ανά μήνα.
# Mean departure delay for each month group. na.rm = TRUE drops missing
# dep_delay values so that they do not turn a group's mean into NA.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
summarise(
  by_month,
  delay = mean(dep_delay, na.rm = TRUE)
)
# A tibble: 12 × 2
month delay
<int> <dbl>
1 1 10.0
2 2 10.8
3 3 13.2
4 4 13.9
5 5 13.0
6 6 20.8
7 7 21.7
8 8 12.6
9 9 6.72
10 10 6.24
11 11 5.44
12 12 16.6
Μέση καθυστέρηση ανά συγκεκριμένη ημέρα του έτους.
# Mean departure delay for each (year, month, day) group, ignoring missing
# dep_delay values. TRUE is spelled out because T can be reassigned.
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
`summarise()` has grouped output by 'year', 'month'. You can override using the
`.groups` argument.
# A tibble: 365 × 4
# Groups: year, month [12]
year month day delay
<int> <int> <int> <dbl>
1 2013 1 1 11.5
2 2013 1 2 13.9
3 2013 1 3 11.0
4 2013 1 4 8.95
5 2013 1 5 5.73
6 2013 1 6 7.15
7 2013 1 7 5.42
8 2013 1 8 2.55
9 2013 1 9 2.28
10 2013 1 10 2.84
# ℹ 355 more rows
Αγωγός (%>%)
Χωρίζουμε τις πτήσεις σε ομάδες ανά προορισμό.
# Group the flights by destination.
by_dest <- group_by(flights, dest)
by_dest
# A tibble: 336,776 × 19
# Groups: dest [105]
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 517 515 2 830 819
2 2013 1 1 533 529 4 850 830
3 2013 1 1 542 540 2 923 850
4 2013 1 1 544 545 -1 1004 1022
5 2013 1 1 554 600 -6 812 837
6 2013 1 1 554 558 -4 740 728
7 2013 1 1 555 600 -5 913 854
8 2013 1 1 557 600 -3 709 723
9 2013 1 1 557 600 -3 838 846
10 2013 1 1 558 600 -2 753 745
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
# One summary row per destination group. TRUE is spelled out in na.rm
# because T is an ordinary variable that can be reassigned.
delay <- summarise(
  by_dest,
  count = n(),                           # number of flights to the destination
  dist = mean(distance, na.rm = TRUE),   # mean distance to the destination
  delay = mean(arr_delay, na.rm = TRUE)  # mean arrival delay at the destination
)
# Keep destinations with more than 20 flights, excluding "JFK".
delay <- filter(delay, count > 20, dest != "JFK")
delay
# A tibble: 97 × 4
dest count dist delay
<chr> <int> <dbl> <dbl>
1 ABQ 254 1826 4.38
2 ACK 265 199 4.85
3 ALB 439 143 14.4
4 ATL 17215 757. 11.3
5 AUS 2439 1514. 6.02
6 AVL 275 584. 8.00
7 BDL 443 116 7.05
8 BGR 375 378 8.03
9 BHM 297 866. 16.9
10 BNA 6333 758. 11.8
# ℹ 87 more rows
Ο αγωγός %>% διαβάζεται «ΑΚΟΛΟΥΘΩΣ» (δηλαδή «και έπειτα»):
# The same per-destination summary written as a single pipeline; read each
# %>% as "then". TRUE is spelled out in na.rm because T can be reassigned.
delayA <- flights %>%                      # take the flights, then
  group_by(dest) %>%                       # group by destination, then
  summarise(                               # summarise each group, then
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20, dest != "JFK")        # keep > 20 flights, drop "JFK"
delayA
# A tibble: 97 × 4
dest count dist delay
<chr> <int> <dbl> <dbl>
1 ABQ 254 1826 4.38
2 ACK 265 199 4.85
3 ALB 439 143 14.4
4 ATL 17215 757. 11.3
5 AUS 2439 1514. 6.02
6 AVL 275 584. 8.00
7 BDL 443 116 7.05
8 BGR 375 378 8.03
9 BHM 297 866. 16.9
10 BNA 6333 758. 11.8
# ℹ 87 more rows
# Scatter plot of mean arrival delay against number of flights, one point
# per tailnum value.
flights %>%
  # Drop rows with a missing departure or arrival delay
  # (presumably cancelled flights).
  filter(!is.na(dep_delay), !is.na(arr_delay)) %>%
  # Group by tailnum. This is a different column from the flight number,
  # which is stored in `flight`.
  group_by(tailnum) %>%
  summarise(
    delay = mean(arr_delay, na.rm = TRUE),  # mean arrival delay per tailnum
    n = n()                                 # number of flights per tailnum
  ) %>%
  # alpha = 1 / 10 makes overlapping points visible as darker areas.
  ggplot(mapping = aes(x = n, y = delay)) +
  geom_point(alpha = 1 / 10)