# Keep only the columns needed for a size estimate and compute how many months
# each station/interval record spans. `start` and `end` are years, so the span
# is (end - start) * 12; records with NA start/end get NA.
# NOTE(review): a record with start == end yields 0 months (see p0 = 0 in the
# skim summary of nmonths), so single-year records add nothing -- confirm
# that this is intended.
m <- stations %>%
  select(station_name, interval, start, end, normals) %>%
  mutate(nmonths = (end - start) * 12)
m
## # A tibble: 26,316 x 6
## station_name interval start end normals nmonths
## <chr> <chr> <dbl> <dbl> <lgl> <dbl>
## 1 DAYSLAND day 1908 1922 FALSE 168
## 2 DAYSLAND hour NA NA FALSE NA
## 3 DAYSLAND month 1908 1922 FALSE 168
## 4 EDMONTON CORONATION day 1978 1979 FALSE 12
## 5 EDMONTON CORONATION hour NA NA FALSE NA
## 6 EDMONTON CORONATION month 1978 1979 FALSE 12
## 7 FLEET day 1987 1990 FALSE 36
## 8 FLEET hour NA NA FALSE NA
## 9 FLEET month 1987 1990 FALSE 36
## 10 GOLDEN VALLEY day 1987 1998 FALSE 132
## # … with 26,306 more rows
# Total span over all records, expressed in years (months / 12); NA spans
# are ignored.
sum(m[["nmonths"]], na.rm = TRUE) / 12
## [1] 402648
# Summarise every column of m (type, missingness, distribution)
skim(m)
Name | m |
Number of rows | 26316 |
Number of columns | 6 |
_______________________ | |
Column type frequency: | |
character | 2 |
logical | 1 |
numeric | 3 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
station_name | 0 | 1 | 3 | 51 | 0 | 7947 | 0 |
interval | 0 | 1 | 3 | 5 | 0 | 3 | 0 |
Variable type: logical
skim_variable | n_missing | complete_rate | mean | count |
---|---|---|---|---|
normals | 0 | 1 | 0.08 | FAL: 24153, TRU: 2163 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
start | 8160 | 0.69 | 1967.90 | 30.00 | 1840 | 1956 | 1970 | 1989 | 2020 | ▁▁▂▇▅ |
end | 8160 | 0.69 | 1990.07 | 23.66 | 1878 | 1974 | 1993 | 2007 | 2021 | ▁▁▂▆▇ |
nmonths | 8160 | 0.69 | 266.13 | 283.27 | 0 | 60 | 168 | 372 | 2124 | ▇▂▁▁▁ |
# Records that start before 1850, converted to a plain data.frame for printing
stations %>%
  filter(start < 1850) %>%
  as.data.frame()
## prov station_name station_id climate_id WMO_id TC_id lat lon elev
## 1 ON TORONTO 5051 6158350 71266 <NA> 43.67 -79.4 112.5
## 2 ON TORONTO 5051 6158350 71266 <NA> 43.67 -79.4 112.5
## tz interval start end normals
## 1 Etc/GMT+5 day 1840 2017 TRUE
## 2 Etc/GMT+5 month 1840 2006 TRUE
## [1] 365
# Look up stations matching "Winnipeg", constrained by starts_latest and
# ends_earliest = 2000 (presumably: records covering the year 2000 -- confirm
# against ?stations_search).
stations_search(
  "Winnipeg",
  starts_latest = 2000,
  ends_earliest = 2000
)
## # A tibble: 14 x 14
## prov station_name station_id climate_id WMO_id TC_id lat lon elev tz
## <chr> <chr> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <chr>
## 1 MB WINNIPEG A … 27174 502S001 71849 XWG 49.9 -97.2 239. Etc/…
## 2 MB WINNIPEG A … 27174 502S001 71849 XWG 49.9 -97.2 239. Etc/…
## 3 MB WINNIPEG A … 27525 5023223 NA <NA> 49.9 -97.2 239. Etc/…
## 4 MB WINNIPEG A … 27525 5023223 NA <NA> 49.9 -97.2 239. Etc/…
## 5 MB WINNIPEG RI… 3698 5023222 71852 YWG 49.9 -97.2 239. Etc/…
## 6 MB WINNIPEG RI… 3698 5023222 71852 YWG 49.9 -97.2 239. Etc/…
## 7 MB WINNIPEG RI… 3698 5023222 71852 YWG 49.9 -97.2 239. Etc/…
## 8 MB WINNIPEG SO… 27230 502M001 NA <NA> 49.8 -97.1 232 Etc/…
## 9 MB WINNIPEG SO… 27230 502M001 NA <NA> 49.8 -97.1 232 Etc/…
## 10 MB WINNIPEG TH… 28051 5023262 71579 XWN 49.9 -97.1 230 Etc/…
## 11 MB WINNIPEG TH… 28051 5023262 71579 XWN 49.9 -97.1 230 Etc/…
## 12 MB WINNIPEG TH… 28051 5023262 71579 XWN 49.9 -97.1 230 Etc/…
## 13 MB WINNIPEGOSIS 3854 5043225 NA <NA> 51.6 -99.9 258. Etc/…
## 14 MB WINNIPEGOSIS 3854 5043225 NA <NA> 51.6 -99.9 258. Etc/…
## # … with 4 more variables: interval <chr>, start <dbl>, end <dbl>,
## # normals <lgl>
# Two station IDs from the Winnipeg search results, used as the sample for
# measuring download sizes
s <- c(3698, 28051)
How much data is one month of hourly observations?
# Download January 2000 for the sample stations. No `interval` is given, so
# weather_dl()'s default applies (presumably hourly -- confirm in ?weather_dl).
d <- weather_dl(
  s,
  start = "2000-01-01",
  end = "2000-01-31"
)
## As of weathercan v0.3.0 time display is either local time or UTC
## See Details under ?weather_dl for more information.
## This message is shown once per session
# Average size per station of the one-month download: size of `d` divided by
# the number of stations requested.
size_hour <- object_size(d) / length(s)
size_hour
## 210 kB
How much data is one month of daily observations?
# Download January 2000 at the "day" interval for the sample stations
d <- weather_dl(
  s,
  start = "2000-01-01",
  end = "2000-01-31",
  interval = "day"
)
# Average size per station of one month of daily data
size_day <- object_size(d) / length(s)
size_day
## 13.1 kB
How much data is one year of monthly observations?
# Download all of 2000 at the "month" interval for the sample stations
d <- weather_dl(
  s,
  start = "2000-01-01",
  end = "2000-12-31",
  interval = "month"
)
# Average size per station of one year of monthly data
size_month <- object_size(d) / length(s)
size_month
## 6.85 kB
How much data is there in total, across all stations and intervals?
# Estimate the total size of all data by scaling each record's span by the
# sample size measured for its interval. size_hour and size_day were measured
# on one month of data, so they multiply nmonths directly; size_month was
# measured on one year of data, hence nmonths / 12.
# Records with NA nmonths (or an unmatched interval) yield NA and are dropped
# from the sum.
total <- m %>%
  mutate(
    size = case_when(
      interval == "hour" ~ nmonths * size_hour,
      interval == "day" ~ nmonths * size_day,
      interval == "month" ~ nmonths / 12 * size_month
    )
  ) %>%
  pull(size) %>%
  sum(na.rm = TRUE)
total
## [1] 122931749776
## [1] "114.5 Gb"
# Open the 1981-2010 page under the configured normals URL, then for each
# province follow its link and pull size-like tokens out of the page's <pre>
# text (presumably a file listing with a trailing size column -- confirm).
n <- html_session(
  file.path(getOption("weathercan.urls.normals"), "1981-2010")
)

# One character vector of scraped tokens per province, keyed by province
size <- list()
for (p in weathercan:::province) {
  size[[p]] <- follow_link(n, p) %>%
    html_nodes(css = "pre") %>%
    html_text() %>%
    # tokens of digits, "." and "K" that sit at the end of a line ...
    str_extract_all("[0-9K.]{1,5}[ ]*\n", simplify = TRUE) %>%
    # ... then drop the trailing spaces/newline, keeping only the token
    str_extract_all("[0-9K.]{1,5}", simplify = TRUE) %>%
    as.vector()
}
## Navigating to AB/
## Navigating to BC/
## Navigating to MB/
## Navigating to NB/
## Navigating to NL/
## Navigating to NT/
## Navigating to NS/
## Navigating to NU/
## Navigating to ON/
## Navigating to PE/
## Navigating to QC/
## Navigating to SK/
## Navigating to YT/
# Turn the scraped tokens into numbers (a "K" suffix is treated as x1000),
# add them up, and print the total as a human-readable object size.
tibble(char = unlist(size, use.names = FALSE)) %>%
  mutate(
    kilo = str_detect(char, "K"),
    size = as.numeric(str_remove(char, "K")),
    size = if_else(kilo, size * 1000, size)
  ) %>%
  pull(size) %>%
  sum() %>%
  structure(class = "object_size") %>%
  format(units = "auto")
## [1] "5.7 Mb"