Import data, running get_data_name to requery the original data source if get_new is TRUE. Data is saved to (and imported from) file.path(save_dir, data_name, paste0(data_name, "_raw.rds")). save_dir and get_new can be passed via ... to get_data_name; otherwise, default values from get_data_name are used (respectively, here::here("out", "ds", data_name, paste0(data_name, "_raw.rds")) and FALSE).
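As a concrete illustration of that path rule (the data_name and save_dir values below are examples for this sketch, not package defaults):

# Illustrative only: where the raw results for a given source are saved/loaded
data_name <- "galah"                  # example data source name
save_dir <- here::here("out", "ds")   # assumed save_dir for this sketch
file.path(save_dir, data_name, paste0(data_name, "_raw.rds"))
# -> "<project root>/out/ds/galah/galah_raw.rds"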
get_data(data_name, ...)
data_name
Character. Name of data source, e.g. 'tern' or 'galah'. Passed to get_data_name.

...
Arguments passed to get_data_name, e.g. save_dir and get_new.
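Here get_data_name is a placeholder for the source-specific function (get_tern, get_galah, ...). A minimal sketch of that dispatch pattern is shown below; it is illustrative only (assuming envImport is attached and the get_* functions follow this naming), not envImport's actual implementation.

# Hypothetical sketch of data_name -> get_<data_name> dispatch
dispatch_sketch <- function(data_name, ...) {
  fn <- get(paste0("get_", data_name))   # e.g. get_galah or get_tern
  fn(...)
}
# dispatch_sketch("galah", get_new = FALSE)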
Dataframe, either loaded from save_dir or from a new query to data_name. If new data is queried, an .rds results file is created, overwriting any existing file. Timing and the number of records are logged in save_dir.
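To inspect what a new query wrote under save_dir (including the timing/record-count log noted above), something like the sketch below can be used; the directory layout is an assumption based on the path rule in the description.

# Illustrative: list what a new query wrote for the 'galah' source
fs::dir_info(here::here("out", "ds", "galah")) %>%
  dplyr::select(path, size, modification_time)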
Other Help with combining data sources:
remap_data_names()
# library("envImport")
out_dir <- file.path(system.file(package = "envImport"), "examples")
## config -------
old_atlas <- galah::galah_config()$atlas$region
galah::galah_config(email = Sys.getenv("GBIF_email")
, username = Sys.getenv("GBIF_user")
, password = Sys.getenv("GBIF_pwd")
, caching = TRUE
, download_reason_id = 10 # testing
)
galah::galah_config(atlas = "GBIF")
#> Atlas selected: Global Biodiversity Information Facility (GBIF) [Global]
# Australian Bustards--------
# in the year 2000
## 01: atlas = gbif --------
save_file <- fs::path(out_dir, "qry01", "qry01.rds")
if(!file.exists(save_file)) {
qry01 <- galah::galah_call() %>%
galah::galah_identify("Ardeotis australis") %>%
galah::galah_filter(year == 2000) %>%
galah::atlas_occurrences() %>%
dplyr::collect()
rio::export(qry01
, save_file
)
} else {
qry01 <- rio::import(save_file)
}
## 02: atlas = ala ----------
galah::galah_config(atlas = "ALA")
#> Atlas selected: Atlas of Living Australia (ALA) [Australia]
galah::galah_config(email = Sys.getenv("ALA_email"))
# 'qry' used for qry02, qry03 and qry04
qry <- galah::galah_call() %>%
galah::galah_identify("Ardeotis australis") %>%
galah::galah_filter(year == 2000)
save_file <- fs::path(out_dir, "qry02", "qry02.rds")
if(!file.exists(save_file)) {
qry02 <- qry %>%
galah::atlas_occurrences()
rio::export(qry02
, save_file
)
} else {
qry02 <- rio::import(save_file
, setclass = "tibble"
)
}
# similar (but not identical) # of records
nrow(qry01)
#> [1] 856
nrow(qry02)
#> [1] 857
## 03: get_galah ---------
qry03 <- get_galah(save_dir = fs::path(out_dir, "qry03")
, data_map = data_map
, qry = qry
)
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/qry03/galah/galah.parquet
# again, not quite the same number of records
nrow(qry02)
#> [1] 857
nrow(qry03)
#> [1] 852
# get_galah removes records with NULL dates, lat and long via
# envImport::remap_data_names (see its arguments for details);
# filtering qry02 on those columns gives the same result as qry03
qry02 %>%
dplyr::filter(!is.na(eventDate)
, !is.na(decimalLatitude)
, !is.na(decimalLongitude)
) %>%
nrow()
#> [1] 852
# qry02 keeps galah's native column names; qry03 names come from data_map
names(qry02)
#> [1] "recordID" "scientificName" "taxonConceptID" "decimalLatitude"
#> [5] "decimalLongitude" "eventDate" "occurrenceStatus" "dataResourceName"
names(qry03)
#> [1] "data_name" "site" "date" "lat"
#> [5] "long" "original_name" "common" "nsx"
#> [9] "occ_derivation" "quantity" "survey" "rel_metres"
#> [13] "method" "obs" "denatured" "kingdom"
#> [17] "occ" "year" "month"
## 04: get_galah with profile -------
qry04 <- get_galah(save_dir = fs::path(out_dir, "qry04")
, data_map = data_map
, qry = qry %>%
galah::apply_profile(CSDM)
)
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/qry04/galah/galah.parquet
# lost some records due to the profile
nrow(qry04)
#> [1] 715
############################################
# Combine data --------
## get_galah for aoi -------
bio_all_galah <- get_galah(aoi = envImport::aoi
, save_dir = out_dir
, data_map = data_map
, sub_dir = "bio_all"
)
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/galah.parquet
## get_tern for aoi --------
bio_all_tern <- get_tern(aoi = envImport::aoi
, save_dir = out_dir
, data_map = data_map
, sub_dir = "bio_all"
)
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/tern.parquet
## or using get_data -------
# to get galah, tern and gbif
datas <- c("galah", "tern", "gbif")
# galah and tern already run from above
temp <- purrr::map(datas
, \(x) get_data(x
, save_dir = out_dir
, get_new = FALSE
, aoi = envImport::aoi
, data_map = data_map
, sub_dir = "bio_all"
, previous_key = "0057643-240626123714530"
)
)
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/galah.parquet
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/tern.parquet
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/gbif.parquet
## single dataset --------
bio_all_files <- fs::dir_ls(fs::path(out_dir, "bio_all")
, regexp = "\\.parquet"
)
bio_all <- purrr::map_dfr(bio_all_files, \(x) rio::import(x))
if(FALSE) {
# check for misaligned classes
check <- purrr::map_dfr(temp
, \(x) purrr::map(x, class)
) %>%
purrr::map_dfr(\(x) length(unique(na.omit(x))) == 1) %>%
tidyr::pivot_longer(everything()) %>%
dplyr::filter(!value)
use_schema <- arrow::schema(bio_all)
use_schema$quantity <- arrow::Field$create("quantity", arrow::string())
bio_all <- arrow::open_dataset(bio_all_files
, schema = use_schema
) %>%
dplyr::collect()
}
# 'bio_all' is now the sum of its components
nrow(bio_all) == sum(purrr::map_dbl(temp, nrow))
#> [1] TRUE
# clean up -------
# return to original atlas
galah::galah_config(atlas = old_atlas)