library(dplyr)
library(tidyr)
library(readr)
# Clean Data ----
# Download the catch table from the KNB data repository.
catch_original <- read.csv(
  file = "https://knb.ecoinformatics.org/knb/d1/mn/v2/object/df35b.302.1"
)
# Split, Apply, Combine ----
# - Remove unnecessary columns
# Keep the region, the year, and the five species columns.
# Alternative noted alongside this step: select by exclusion instead, i.e.
#   select(-All, -notesRegCode)
catch_data <- catch_original %>%
  select(
    Region, Year,
    Chinook, Sockeye, Coho, Pink, Chum
  )
head(catch_data)
## Region Year Chinook Sockeye Coho Pink Chum
## 1 SSE 1886 0 5 0 0 0
## 2 SSE 1887 0 155 0 0 0
## 3 SSE 1888 0 224 16 0 0
## 4 SSE 1889 0 182 11 92 0
## 5 SSE 1890 0 251 42 0 0
## 6 SSE 1891 0 274 24 0 0
# Per-column summary; a quick way to spot columns read with the wrong type.
summary(catch_data)
## Region Year Chinook Sockeye
## Length:1708 Min. :1878 Length:1708 Min. : 0.00
## Class :character 1st Qu.:1922 Class :character 1st Qu.: 6.75
## Mode :character Median :1947 Mode :character Median : 330.50
## Mean :1946 Mean : 1401.09
## 3rd Qu.:1972 3rd Qu.: 995.50
## Max. :1997 Max. :44269.00
## Coho Pink Chum
## Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 41.5 Median : 34.5 Median : 63.0
## Mean : 150.4 Mean : 2357.8 Mean : 422.0
## 3rd Qu.: 175.0 3rd Qu.: 1622.5 3rd Qu.: 507.5
## Max. :3220.0 Max. :53676.0 Max. :10459.0
# Compact overview of every column's type and its first values.
glimpse(catch_data)
## Rows: 1,708
## Columns: 7
## $ Region <chr> "SSE", "SSE", "SSE", "SSE", "SSE", "SSE", "SSE", "SSE", "SSE",…
## $ Year <int> 1886, 1887, 1888, 1889, 1890, 1891, 1892, 1893, 1894, 1895, 18…
## $ Chinook <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "3", "4", "5", "9…
## $ Sockeye <int> 5, 155, 224, 182, 251, 274, 207, 189, 253, 408, 989, 791, 708,…
## $ Coho <int> 0, 0, 16, 11, 42, 24, 11, 1, 5, 8, 192, 161, 132, 139, 84, 107…
## $ Pink <int> 0, 0, 0, 92, 0, 0, 8, 187, 529, 606, 996, 2218, 673, 1545, 204…
## $ Chum <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 1, 2, 0, 0, 0, 102, 343…
# First attempt: coerce Chinook directly to numeric. Entries that are not
# numbers turn into NA, which is what the warning below reports.
catch_clean <- catch_data %>%
  mutate(Chinook = as.numeric(Chinook))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
head(catch_clean)
## Region Year Chinook Sockeye Coho Pink Chum
## 1 SSE 1886 0 5 0 0 0
## 2 SSE 1887 0 155 0 0 0
## 3 SSE 1888 0 224 16 0 0
## 4 SSE 1889 0 182 11 92 0
## 5 SSE 1890 0 251 42 0 0
## 6 SSE 1891 0 274 24 0 0
# Replace the non-numeric value "I" with "1", then convert Chinook to integer.
catch_clean <- catch_data %>%
  mutate(
    Chinook = as.integer(if_else(Chinook == "I", "1", Chinook))
  )
head(catch_clean)
## Region Year Chinook Sockeye Coho Pink Chum
## 1 SSE 1886 0 5 0 0 0
## 2 SSE 1887 0 155 0 0 0
## 3 SSE 1888 0 224 16 0 0
## 4 SSE 1889 0 182 11 92 0
## 5 SSE 1890 0 251 42 0 0
## 6 SSE 1891 0 274 24 0 0
# Reshape to long format: every column except Region and Year is stacked into
# a `species` (former column name) / `catch` (value) pair.
catch_long <- catch_clean %>%
  pivot_longer(
    cols = -c(Region, Year),
    names_to = "species",
    values_to = "catch"
  )
head(catch_long)
## # A tibble: 6 × 4
## Region Year species catch
## <chr> <int> <chr> <int>
## 1 SSE 1886 Chinook 0
## 2 SSE 1886 Sockeye 5
## 3 SSE 1886 Coho 0
## 4 SSE 1886 Pink 0
## 5 SSE 1886 Chum 0
## 6 SSE 1887 Chinook 0
# Reverse the reshape: spread the species back out into one column each.
catch_wide <- catch_long %>%
  pivot_wider(
    names_from = species,
    values_from = catch
  )
head(catch_wide)
## # A tibble: 6 × 7
## Region Year Chinook Sockeye Coho Pink Chum
## <chr> <int> <int> <int> <int> <int> <int>
## 1 SSE 1886 0 5 0 0 0
## 2 SSE 1887 0 155 0 0 0
## 3 SSE 1888 0 224 16 0 0
## 4 SSE 1889 0 182 11 92 0
## 5 SSE 1890 0 251 42 0 0
## 6 SSE 1891 0 274 24 0 0
# Rename `catch` to `catch_thousands` so the column name states its scale.
catch_long <- rename(catch_long, catch_thousands = catch)
head(catch_long)
## # A tibble: 6 × 4
## Region Year species catch_thousands
## <chr> <int> <chr> <int>
## 1 SSE 1886 Chinook 0
## 2 SSE 1886 Sockeye 5
## 3 SSE 1886 Coho 0
## 4 SSE 1886 Pink 0
## 5 SSE 1886 Chum 0
## 6 SSE 1887 Chinook 0
# Add a `catch` column holding the value scaled up from thousands.
catch_long <- mutate(catch_long, catch = 1000 * catch_thousands)
head(catch_long)
## # A tibble: 6 × 5
## Region Year species catch_thousands catch
## <chr> <int> <chr> <int> <dbl>
## 1 SSE 1886 Chinook 0 0
## 2 SSE 1886 Sockeye 5 5000
## 3 SSE 1886 Coho 0 0
## 4 SSE 1886 Pink 0 0
## 5 SSE 1886 Chum 0 0
## 6 SSE 1887 Chinook 0 0
# The same conversion as one pipeline that also drops the thousands column.
catch_long <- catch_long %>%
  mutate(catch = catch_thousands * 1000) %>%
  select(-catch_thousands)
head(catch_long)
## # A tibble: 6 × 4
## Region Year species catch
## <chr> <int> <chr> <dbl>
## 1 SSE 1886 Chinook 0
## 2 SSE 1886 Sockeye 5000
## 3 SSE 1886 Coho 0
## 4 SSE 1886 Pink 0
## 5 SSE 1886 Chum 0
## 6 SSE 1887 Chinook 0
# Summarize ----
# Mean catch per region.
mean_region <- catch_long %>%
  group_by(Region) %>%
  summarize(catch_mean = mean(catch))
head(mean_region)
## # A tibble: 6 × 2
## Region catch_mean
## <chr> <dbl>
## 1 ALU 40384.
## 2 BER 16373.
## 3 BRB 2709796.
## 4 CHG 315487.
## 5 CKI 683571.
## 6 COP 179223.
# Number of rows per region; count() is shorthand for group_by() followed by
# summarise(n = n()).
n_region <- catch_long %>%
  count(Region)
head(n_region)
## # A tibble: 6 × 2
## Region n
## <chr> <int>
## 1 ALU 435
## 2 BER 510
## 3 BRB 570
## 4 CHG 550
## 5 CKI 525
## 6 COP 470
# Variance of catch per region.
var_region <- catch_long %>%
  group_by(Region) %>%
  summarise(catch_var = var(catch))
head(var_region)
## # A tibble: 6 × 2
## Region catch_var
## <chr> <dbl>
## 1 ALU 4.99e10
## 2 BER 1.73e 9
## 3 BRB 4.30e13
## 4 CHG 2.59e11
## 5 CKI 1.19e12
## 6 COP 1.14e11
# Mean catch for every Region/species pair.
# `.groups = "drop_last"` spells out what summarise() does by default here
# (the result stays grouped by Region), so the informational message about
# grouped output is no longer emitted.
mean_sp_reg <- catch_long %>%
  group_by(Region, species) %>%
  summarize(mean_sp_reg = mean(catch), .groups = "drop_last")
head(mean_sp_reg)
## # A tibble: 6 × 3
## # Groups: Region [2]
## Region species mean_sp_reg
## <chr> <chr> <dbl>
## 1 ALU Chinook 23.0
## 2 ALU Chum 2908.
## 3 ALU Coho 195.
## 4 ALU Pink 191954.
## 5 ALU Sockeye 6839.
## 6 BER Chinook 19.6
# Filter ----
# Rows for the SSE region only.
SSE_catch <- filter(catch_long, Region == "SSE")
head(SSE_catch)
## # A tibble: 6 × 4
## Region Year species catch
## <chr> <int> <chr> <dbl>
## 1 SSE 1886 Chinook 0
## 2 SSE 1886 Sockeye 5000
## 3 SSE 1886 Coho 0
## 4 SSE 1886 Pink 0
## 5 SSE 1886 Chum 0
## 6 SSE 1887 Chinook 0
# Rows whose catch exceeds one million.
mill_catch <- catch_long %>%
  filter(catch > 1e6)
head(mill_catch)
## # A tibble: 6 × 4
## Region Year species catch
## <chr> <int> <chr> <dbl>
## 1 SSE 1897 Pink 2218000
## 2 SSE 1899 Pink 1545000
## 3 SSE 1900 Pink 2040000
## 4 SSE 1901 Pink 1049000
## 5 SSE 1902 Pink 1547000
## 6 SSE 1904 Sockeye 1143000
# Rows that are both in the SSE region and of species Chinook.
SSE_Chin_catch <- catch_long %>%
  filter(Region == "SSE" & species == "Chinook")
head(SSE_Chin_catch)
## # A tibble: 6 × 4
## Region Year species catch
## <chr> <int> <chr> <dbl>
## 1 SSE 1886 Chinook 0
## 2 SSE 1887 Chinook 0
## 3 SSE 1888 Chinook 0
## 4 SSE 1889 Chinook 0
## 5 SSE 1890 Chinook 0
## 6 SSE 1891 Chinook 0
# Sorting ----
# Mean catch per region, smallest mean first.
mean_region <- catch_long %>%
  group_by(Region) %>%
  summarize(mean_catch = mean(catch)) %>%
  arrange(mean_catch)
head(mean_region)
## # A tibble: 6 × 2
## Region mean_catch
## <chr> <dbl>
## 1 BER 16373.
## 2 KTZ 18836.
## 3 ALU 40384.
## 4 NRS 51503.
## 5 KSK 67642.
## 6 YUK 68646.
# Mean catch per region, largest mean first.
mean_region <- catch_long %>%
  group_by(Region) %>%
  summarize(mean_catch = mean(catch)) %>%
  arrange(desc(mean_catch))
head(mean_region)
## # A tibble: 6 × 2
## Region mean_catch
## <chr> <dbl>
## 1 SSE 3184661.
## 2 BRB 2709796.
## 3 NSE 1825021.
## 4 KOD 1528350
## 5 PWS 1419237.
## 6 SOP 1110942.
# Joins ----
# Download the region definitions table and keep only the region code and
# the management-area name.
region_defs <- read.csv(
  "https://knb.ecoinformatics.org/knb/d1/mn/v2/object/df35b.303.1"
) %>%
  select(code, mgmtArea)
head(region_defs)
## code mgmtArea
## 1 GSE Unallocated Southeast Alaska
## 2 NSE Northern Southeast Alaska
## 3 SSE Southern Southeast Alaska
## 4 YAK Yakutat
## 5 PWSmgmt Prince William Sound Management Area
## 6 BER Bering River Subarea Copper River Subarea
# Attach the management-area name to each catch row; the key is called
# `Region` on the left and `code` on the right.
catch_joined <- catch_long %>%
  left_join(region_defs, by = c("Region" = "code"))
head(catch_joined)
## # A tibble: 6 × 5
## Region Year species catch mgmtArea
## <chr> <int> <chr> <dbl> <chr>
## 1 SSE 1886 Chinook 0 Southern Southeast Alaska
## 2 SSE 1886 Sockeye 5000 Southern Southeast Alaska
## 3 SSE 1886 Coho 0 Southern Southeast Alaska
## 4 SSE 1886 Pink 0 Southern Southeast Alaska
## 5 SSE 1886 Chum 0 Southern Southeast Alaska
## 6 SSE 1887 Chinook 0 Southern Southeast Alaska
# Alternative: rename the lookup columns first so both tables share the key
# name `Region`, then join on that single name.
region_defs <- rename(region_defs, Region = code, Region_Name = mgmtArea)
catch_joined <- catch_long %>%
  left_join(region_defs, by = "Region")
head(catch_joined)
## # A tibble: 6 × 5
## Region Year species catch Region_Name
## <chr> <int> <chr> <dbl> <chr>
## 1 SSE 1886 Chinook 0 Southern Southeast Alaska
## 2 SSE 1886 Sockeye 5000 Southern Southeast Alaska
## 3 SSE 1886 Coho 0 Southern Southeast Alaska
## 4 SSE 1886 Pink 0 Southern Southeast Alaska
## 5 SSE 1886 Chum 0 Southern Southeast Alaska
## 6 SSE 1887 Chinook 0 Southern Southeast Alaska
# Example data: site identifiers of the form "<island>-<number>".
sites_df <- data.frame(
  site = c("HAW-101", "HAW-103", "OAH-320", "OAH-219", "MAI-039")
)
# Split each identifier at the dash; the result is printed, not stored.
sites_df %>%
  separate(site, into = c("island", "site_number"), sep = "-")
## island site_number
## 1 HAW 101
## 2 HAW 103
## 3 OAH 320
## 4 OAH 219
## 5 MAI 039
# Example data: "<city> <state>" strings.
cities_df <- data.frame(
  city = c("Juneau AK", "Sitka AK", "Anchorage AK")
)
# Split at the space into city and state; the result is printed, not stored.
cities_df %>%
  separate(city, into = c("city", "state"), sep = " ")
## city state
## 1 Juneau AK
## 2 Sitka AK
## 3 Anchorage AK
# Example data: date parts stored as separate character columns.
dates_df <- data.frame(
  year = c("1930", "1930", "1930"),
  month = c("12", "12", "12"),
  day = c("14", "15", "16")
)
# Paste the three parts into a single `date` column, joined with dashes.
dates_df %>%
  unite(date, year, month, day, sep = "-")
## date
## 1 1930-12-14
## 2 1930-12-15
## 3 1930-12-16
# Split "city state" into two columns and keep the result. separate() already
# returns a data frame, so no data.frame() wrapper is needed.
cities_df <- cities_df %>%
  separate(city, into = c("city", "state"), sep = " ")
# Recombine the two columns for display only: the result is printed, not
# stored (prefix the pipeline with `cities_df <-` to keep it).
cities_df %>%
  unite(city_state, city, state, sep = " ")
## city_state
## 1 Juneau AK
## 2 Sitka AK
## 3 Anchorage AK
#cities_df
# Final Summary ----
# Reload both source tables through explicit libcurl URL connections.
catch_original <- read.csv(
  url(
    "https://knb.ecoinformatics.org/knb/d1/mn/v2/object/df35b.302.1",
    method = "libcurl"
  )
)
region_defs <- read.csv(
  url(
    "https://knb.ecoinformatics.org/knb/d1/mn/v2/object/df35b.303.1",
    method = "libcurl"
  )
) %>%
  select(code, mgmtArea)
# End-to-end pipeline: clean the raw table, reshape it to long format, scale
# the catch by 1000, average per region, and attach the management-area name.
mean_region <- catch_original %>%
  select(-All, -notesRegCode) %>%
  # Chinook is read as character; use the type-strict if_else() with a string
  # replacement rather than ifelse() with a numeric 1, which only works
  # through silent coercion back to character.
  mutate(Chinook = if_else(Chinook == "I", "1", Chinook)) %>%
  mutate(Chinook = as.numeric(Chinook)) %>%
  pivot_longer(-c(Region, Year), names_to = "species", values_to = "catch") %>%
  mutate(catch = catch * 1000) %>%
  group_by(Region) %>%
  summarize(mean_catch = mean(catch)) %>%
  left_join(region_defs, by = c("Region" = "code"))
head(mean_region)
## # A tibble: 6 × 3
## Region mean_catch mgmtArea
## <chr> <dbl> <chr>
## 1 ALU 40384. Aleutian Islands Subarea
## 2 BER 16373. Bering River Subarea Copper River Subarea
## 3 BRB 2709796. Bristol Bay Management Area
## 4 CHG 315487. Chignik Management Area
## 5 CKI 683571. Cook Inlet Management Area
## 6 COP 179223. Copper River Subarea