library(dplyr)
library(tidyr)
library(readr)

# Clean Data ----

# Download the raw catch table from the KNB data repository.
catch_original <- read.csv(
  file = "https://knb.ecoinformatics.org/knb/d1/mn/v2/object/df35b.302.1"
)

# Split, Apply, Combine ----

# Keep the region, the year and the five per-species catch columns.
# Alternative form that drops columns instead of keeping them:
#   select(catch_original, -All, -notesRegCode)
catch_data <- select(
  catch_original,
  Region, Year, Chinook, Sockeye, Coho, Pink, Chum
)
head(catch_data)
##   Region Year Chinook Sockeye Coho Pink Chum
## 1    SSE 1886       0       5    0    0    0
## 2    SSE 1887       0     155    0    0    0
## 3    SSE 1888       0     224   16    0    0
## 4    SSE 1889       0     182   11   92    0
## 5    SSE 1890       0     251   42    0    0
## 6    SSE 1891       0     274   24    0    0
# Per-column summary; note that Chinook is reported as character, not numeric.
summary(catch_data)
##     Region               Year        Chinook             Sockeye        
##  Length:1708        Min.   :1878   Length:1708        Min.   :    0.00  
##  Class :character   1st Qu.:1922   Class :character   1st Qu.:    6.75  
##  Mode  :character   Median :1947   Mode  :character   Median :  330.50  
##                     Mean   :1946                      Mean   : 1401.09  
##                     3rd Qu.:1972                      3rd Qu.:  995.50  
##                     Max.   :1997                      Max.   :44269.00  
##       Coho             Pink              Chum        
##  Min.   :   0.0   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:   0.0   1st Qu.:    0.0   1st Qu.:    0.0  
##  Median :  41.5   Median :   34.5   Median :   63.0  
##  Mean   : 150.4   Mean   : 2357.8   Mean   :  422.0  
##  3rd Qu.: 175.0   3rd Qu.: 1622.5   3rd Qu.:  507.5  
##  Max.   :3220.0   Max.   :53676.0   Max.   :10459.0
# Compact overview of each column's type and leading values.
glimpse(catch_data)
## Rows: 1,708
## Columns: 7
## $ Region  <chr> "SSE", "SSE", "SSE", "SSE", "SSE", "SSE", "SSE", "SSE", "SSE",…
## $ Year    <int> 1886, 1887, 1888, 1889, 1890, 1891, 1892, 1893, 1894, 1895, 18…
## $ Chinook <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "3", "4", "5", "9…
## $ Sockeye <int> 5, 155, 224, 182, 251, 274, 207, 189, 253, 408, 989, 791, 708,…
## $ Coho    <int> 0, 0, 16, 11, 42, 24, 11, 1, 5, 8, 192, 161, 132, 139, 84, 107…
## $ Pink    <int> 0, 0, 0, 92, 0, 0, 8, 187, 529, 606, 996, 2218, 673, 1545, 204…
## $ Chum    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 1, 2, 0, 0, 0, 102, 343…
# First attempt at making Chinook numeric. Entries that are not valid numbers
# become NA (hence the coercion warning), so this result is replaced further
# down once the offending value has been handled.
catch_clean <- mutate(catch_data, Chinook = as.numeric(Chinook))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
# Preview the first rows of the converted table.
head(catch_clean)
##   Region Year Chinook Sockeye Coho Pink Chum
## 1    SSE 1886       0       5    0    0    0
## 2    SSE 1887       0     155    0    0    0
## 3    SSE 1888       0     224   16    0    0
## 4    SSE 1889       0     182   11   92    0
## 5    SSE 1890       0     251   42    0    0
## 6    SSE 1891       0     274   24    0    0
# Replace the stray "I" entry in Chinook with "1", then convert the column to
# integer. Both steps run in order inside a single mutate().
catch_clean <- catch_data %>%
  mutate(
    Chinook = if_else(Chinook == "I", "1", Chinook),
    Chinook = as.integer(Chinook)
  )
head(catch_clean)
##   Region Year Chinook Sockeye Coho Pink Chum
## 1    SSE 1886       0       5    0    0    0
## 2    SSE 1887       0     155    0    0    0
## 3    SSE 1888       0     224   16    0    0
## 4    SSE 1889       0     182   11   92    0
## 5    SSE 1890       0     251   42    0    0
## 6    SSE 1891       0     274   24    0    0
# Reshape to long format: every column except Region and Year becomes a
# (species, catch) pair on its own row.
catch_long <- pivot_longer(
  catch_clean,
  cols = -c(Region, Year),
  names_to = "species",
  values_to = "catch"
)
head(catch_long)
## # A tibble: 6 × 4
##   Region  Year species catch
##   <chr>  <int> <chr>   <int>
## 1 SSE     1886 Chinook     0
## 2 SSE     1886 Sockeye     5
## 3 SSE     1886 Coho        0
## 4 SSE     1886 Pink        0
## 5 SSE     1886 Chum        0
## 6 SSE     1887 Chinook     0
# Reverse the reshape: spread each species back into its own column.
catch_wide <- pivot_wider(
  catch_long,
  names_from = species,
  values_from = catch
)
head(catch_wide)
## # A tibble: 6 × 7
##   Region  Year Chinook Sockeye  Coho  Pink  Chum
##   <chr>  <int>   <int>   <int> <int> <int> <int>
## 1 SSE     1886       0       5     0     0     0
## 2 SSE     1887       0     155     0     0     0
## 3 SSE     1888       0     224    16     0     0
## 4 SSE     1889       0     182    11    92     0
## 5 SSE     1890       0     251    42     0     0
## 6 SSE     1891       0     274    24     0     0
# Rename the value column so its name states that it is in thousands.
catch_long <- rename(catch_long, catch_thousands = catch)
head(catch_long)
## # A tibble: 6 × 4
##   Region  Year species catch_thousands
##   <chr>  <int> <chr>             <int>
## 1 SSE     1886 Chinook               0
## 2 SSE     1886 Sockeye               5
## 3 SSE     1886 Coho                  0
## 4 SSE     1886 Pink                  0
## 5 SSE     1886 Chum                  0
## 6 SSE     1887 Chinook               0
# Add `catch`, the thousands column multiplied by 1000.
catch_long <- mutate(catch_long, catch = catch_thousands * 1000)
head(catch_long)
## # A tibble: 6 × 5
##   Region  Year species catch_thousands catch
##   <chr>  <int> <chr>             <int> <dbl>
## 1 SSE     1886 Chinook               0     0
## 2 SSE     1886 Sockeye               5  5000
## 3 SSE     1886 Coho                  0     0
## 4 SSE     1886 Pink                  0     0
## 5 SSE     1886 Chum                  0     0
## 6 SSE     1887 Chinook               0     0
# Recompute `catch` and remove the thousands column in the same mutate() call;
# assigning NULL to a column drops it.
catch_long <- catch_long %>%
  mutate(
    catch = catch_thousands * 1000,
    catch_thousands = NULL
  )
head(catch_long)
## # A tibble: 6 × 4
##   Region  Year species catch
##   <chr>  <int> <chr>   <dbl>
## 1 SSE     1886 Chinook     0
## 2 SSE     1886 Sockeye  5000
## 3 SSE     1886 Coho        0
## 4 SSE     1886 Pink        0
## 5 SSE     1886 Chum        0
## 6 SSE     1887 Chinook     0

# Summarize ----

# Mean catch for each region, pooled over every year and species.
mean_region <- catch_long %>%
  group_by(Region) %>%
  summarize(catch_mean = mean(catch))
head(mean_region)
## # A tibble: 6 × 2
##   Region catch_mean
##   <chr>       <dbl>
## 1 ALU        40384.
## 2 BER        16373.
## 3 BRB      2709796.
## 4 CHG       315487.
## 5 CKI       683571.
## 6 COP       179223.
# Number of rows recorded for each region, stored in column `n`.
n_region <- count(catch_long, Region)
head(n_region)
## # A tibble: 6 × 2
##   Region     n
##   <chr>  <int>
## 1 ALU      435
## 2 BER      510
## 3 BRB      570
## 4 CHG      550
## 5 CKI      525
## 6 COP      470
# Variance of the catch values within each region.
var_region <- catch_long %>%
  group_by(Region) %>%
  summarise(catch_var = var(catch))
head(var_region)
## # A tibble: 6 × 2
##   Region catch_var
##   <chr>      <dbl>
## 1 ALU      4.99e10
## 2 BER      1.73e 9
## 3 BRB      4.30e13
## 4 CHG      2.59e11
## 5 CKI      1.19e12
## 6 COP      1.14e11
# Mean catch for every Region/species combination.
# `.groups = "drop_last"` spells out the regrouping that summarize() applies by
# default (the result stays grouped by Region), so the outcome is unchanged but
# the informational "has grouped output by" message is no longer emitted.
mean_sp_reg <- catch_long %>%
  group_by(Region, species) %>%
  summarize(mean_sp_reg = mean(catch), .groups = "drop_last")
head(mean_sp_reg)
## # A tibble: 6 × 3
## # Groups:   Region [2]
##   Region species mean_sp_reg
##   <chr>  <chr>         <dbl>
## 1 ALU    Chinook        23.0
## 2 ALU    Chum         2908. 
## 3 ALU    Coho          195. 
## 4 ALU    Pink       191954. 
## 5 ALU    Sockeye      6839. 
## 6 BER    Chinook        19.6

# Filter ----

# Rows belonging to the SSE region only.
SSE_catch <- filter(catch_long, Region == "SSE")
head(SSE_catch)
## # A tibble: 6 × 4
##   Region  Year species catch
##   <chr>  <int> <chr>   <dbl>
## 1 SSE     1886 Chinook     0
## 2 SSE     1886 Sockeye  5000
## 3 SSE     1886 Coho        0
## 4 SSE     1886 Pink        0
## 5 SSE     1886 Chum        0
## 6 SSE     1887 Chinook     0
# Rows whose catch value is greater than one million.
mill_catch <- filter(catch_long, catch > 1e6)
head(mill_catch)
## # A tibble: 6 × 4
##   Region  Year species   catch
##   <chr>  <int> <chr>     <dbl>
## 1 SSE     1897 Pink    2218000
## 2 SSE     1899 Pink    1545000
## 3 SSE     1900 Pink    2040000
## 4 SSE     1901 Pink    1049000
## 5 SSE     1902 Pink    1547000
## 6 SSE     1904 Sockeye 1143000
# Chinook rows from the SSE region (both conditions must hold).
SSE_Chin_catch <- catch_long %>%
  filter(Region == "SSE" & species == "Chinook")
head(SSE_Chin_catch)
## # A tibble: 6 × 4
##   Region  Year species catch
##   <chr>  <int> <chr>   <dbl>
## 1 SSE     1886 Chinook     0
## 2 SSE     1887 Chinook     0
## 3 SSE     1888 Chinook     0
## 4 SSE     1889 Chinook     0
## 5 SSE     1890 Chinook     0
## 6 SSE     1891 Chinook     0

# Sorting ----

# Regional mean catch, ordered from the smallest mean to the largest.
mean_region <- catch_long %>%
  group_by(Region) %>%
  summarize(mean_catch = mean(catch))
mean_region <- arrange(mean_region, mean_catch)
head(mean_region)
## # A tibble: 6 × 2
##   Region mean_catch
##   <chr>       <dbl>
## 1 BER        16373.
## 2 KTZ        18836.
## 3 ALU        40384.
## 4 NRS        51503.
## 5 KSK        67642.
## 6 YUK        68646.
# Regional mean catch, ordered from the largest mean to the smallest.
mean_region <- catch_long %>%
  group_by(Region) %>%
  summarize(mean_catch = mean(catch))
mean_region <- arrange(mean_region, desc(mean_catch))
head(mean_region)
## # A tibble: 6 × 2
##   Region mean_catch
##   <chr>       <dbl>
## 1 SSE      3184661.
## 2 BRB      2709796.
## 3 NSE      1825021.
## 4 KOD      1528350 
## 5 PWS      1419237.
## 6 SOP      1110942.

# Joins ----

# Download the region lookup table and keep only the region code and the
# management-area name.
region_defs <- select(
  read.csv("https://knb.ecoinformatics.org/knb/d1/mn/v2/object/df35b.303.1"),
  code, mgmtArea
)
head(region_defs)
##      code                                  mgmtArea
## 1     GSE              Unallocated Southeast Alaska
## 2     NSE                 Northern Southeast Alaska
## 3     SSE                 Southern Southeast Alaska
## 4     YAK                                   Yakutat
## 5 PWSmgmt      Prince William Sound Management Area
## 6     BER Bering River Subarea Copper River Subarea
# Attach the management-area name to every catch row; `Region` in the catch
# data is matched against `code` in the lookup table.
catch_joined <- catch_long %>%
  left_join(region_defs, by = c("Region" = "code"))
head(catch_joined)
## # A tibble: 6 × 5
##   Region  Year species catch mgmtArea                 
##   <chr>  <int> <chr>   <dbl> <chr>                    
## 1 SSE     1886 Chinook     0 Southern Southeast Alaska
## 2 SSE     1886 Sockeye  5000 Southern Southeast Alaska
## 3 SSE     1886 Coho        0 Southern Southeast Alaska
## 4 SSE     1886 Pink        0 Southern Southeast Alaska
## 5 SSE     1886 Chum        0 Southern Southeast Alaska
## 6 SSE     1887 Chinook     0 Southern Southeast Alaska
# Rename the lookup columns so the join key is called `Region` in both tables,
# then repeat the join using that shared name.
region_defs <- rename(region_defs, Region = code, Region_Name = mgmtArea)
catch_joined <- catch_long %>%
  left_join(region_defs, by = "Region")
head(catch_joined)
## # A tibble: 6 × 5
##   Region  Year species catch Region_Name              
##   <chr>  <int> <chr>   <dbl> <chr>                    
## 1 SSE     1886 Chinook     0 Southern Southeast Alaska
## 2 SSE     1886 Sockeye  5000 Southern Southeast Alaska
## 3 SSE     1886 Coho        0 Southern Southeast Alaska
## 4 SSE     1886 Pink        0 Southern Southeast Alaska
## 5 SSE     1886 Chum        0 Southern Southeast Alaska
## 6 SSE     1887 Chinook     0 Southern Southeast Alaska
# Example: split each site code at the hyphen into an island part and a site
# number. The split pieces stay character, so "039" keeps its leading zero.
sites_df <- data.frame(
  site = c("HAW-101", "HAW-103", "OAH-320", "OAH-219", "MAI-039")
)
separate(sites_df, site, into = c("island", "site_number"), sep = "-")
##   island site_number
## 1    HAW         101
## 2    HAW         103
## 3    OAH         320
## 4    OAH         219
## 5    MAI         039
# Example: split "City ST" strings at the space into city and state columns.
# The result is only printed; cities_df itself is left unchanged.
cities_df <- data.frame(city = c("Juneau AK", "Sitka AK", "Anchorage AK"))
separate(cities_df, city, into = c("city", "state"), sep = " ")
##        city state
## 1    Juneau    AK
## 2     Sitka    AK
## 3 Anchorage    AK
# Example: glue separate year, month and day strings into one "date" column
# joined by hyphens.
dates_df <- data.frame(
  year = rep("1930", 3),
  month = rep("12", 3),
  day = c("14", "15", "16")
)
unite(dates_df, date, year, month, day, sep = "-")
##         date
## 1 1930-12-14
## 2 1930-12-15
## 3 1930-12-16
# Store the split version of cities_df this time (city and state columns).
cities_df <- cities_df %>%
  separate(city, into = c("city", "state"), sep = " ") %>%
  data.frame()

# Recombine the two columns into one. The result is only printed; assign it
# back to cities_df to keep the merged column.
unite(cities_df, city_state, city, state, sep = " ")
##     city_state
## 1    Juneau AK
## 2     Sitka AK
## 3 Anchorage AK
#cities_df

# Final Summary ----

# Reload both source tables, this time through explicit libcurl connections.
catch_original <- read.csv(
  url("https://knb.ecoinformatics.org/knb/d1/mn/v2/object/df35b.302.1", method = "libcurl")
)
# Only the region code and management-area name are kept from the lookup table.
region_defs <- read.csv(
  url("https://knb.ecoinformatics.org/knb/d1/mn/v2/object/df35b.303.1", method = "libcurl")
) %>%
  select(code, mgmtArea)

# Whole workflow in one pipeline: drop the unused columns, repair and convert
# Chinook, reshape to long format, rescale the catch from thousands, average
# it per region, and attach the management-area name.
mean_region <- catch_original %>%
  select(-All, -notesRegCode) %>%
  # Chinook is read in as character, so the replacement is the string "1".
  # if_else() is type-strict and will not silently coerce mixed types the way
  # ifelse() does; this also matches the earlier cleaning step.
  mutate(Chinook = if_else(Chinook == "I", "1", Chinook)) %>%
  mutate(Chinook = as.numeric(Chinook)) %>%
  pivot_longer(-c(Region, Year), names_to = "species", values_to = "catch") %>%
  mutate(catch = catch * 1000) %>%
  group_by(Region) %>%
  summarize(mean_catch = mean(catch)) %>%
  left_join(region_defs, by = c("Region" = "code"))

head(mean_region)
## # A tibble: 6 × 3
##   Region mean_catch mgmtArea                                 
##   <chr>       <dbl> <chr>                                    
## 1 ALU        40384. Aleutian Islands Subarea                 
## 2 BER        16373. Bering River Subarea Copper River Subarea
## 3 BRB      2709796. Bristol Bay Management Area              
## 4 CHG       315487. Chignik Management Area                  
## 5 CKI       683571. Cook Inlet Management Area               
## 6 COP       179223. Copper River Subarea