Get data using galah::atlas_occurrences()
get_galah(
aoi = NULL,
save_dir = NULL,
get_new = FALSE,
name = "galah",
data_map = NULL,
node = "ALA",
qry = NULL,
check_rel_metres = TRUE,
filter_inconsistent = TRUE,
...
)
Optional simple feature (sf). Used to limit the occurrences
returned via galah::galah_geolocate()
. Note the limitations given in
galah_geolocate
: "Shapefiles must be simple to be accepted by the ALA... If
type = "polygon", WKT strings longer than 10000 characters and sf objects
with more than 500 vertices will not be accepted by the ALA."
Character. Path to directory into which to save outputs. If
NULL
results will be saved to here::here("out", "ds", "galah")
. File will
be named galah.parquet
Logical. If FALSE, will attempt to load from existing
save_dir
; if TRUE (or if no previously saved file exists), new data are retrieved.
Character. data_name
value in envImport::data_map
(or other data_map
)
Dataframe or NULL. Mapping of fields to retrieve. See example
envImport::data_map
Character. Name of atlas to use (see galah::atlas_occurrences()
).
Doesn't seem to work with node = "GBIF" and untested on other nodes.
NULL
or an object of class data_request, created using
galah::galah_call()
. NOTE: do not include any galah::atlas_occurrences()
in the qry
, this is called by get_galah.
Logical. Ensure that coordinateUncertaintyInMetres
is no less than generalisationInMetres
? Only relevant if both columns are
returned by qry
Logical. If TRUE
, inconsistencies between the
occurrenceStatus
column and the organismQuantity
column are filtered
(removed). e.g. a record with occurrenceStatus == "ABSENT"
but
organismQuantity == 10
would be filtered. Only relevant if both columns are
returned by qry
Passed to envImport::file_prep()
and
envImport::remap_data_names()
Dataframe of occurrences, with a file also saved to save_dir
. A .bib reference file is created
when download_reason_id != 10.
# Setup: output directory and galah configuration ------
# library("envImport")
out_dir <- file.path(system.file(package = "envImport"), "examples")
## config -------
# Remember the currently configured atlas so it can be restored at the end
old_atlas <- galah::galah_config()$atlas$region
# Credentials are read from environment variables (set GBIF_email,
# GBIF_user and GBIF_pwd before running). download_reason_id = 10 is
# "testing"; no .bib file is written when download_reason_id == 10
galah::galah_config(email = Sys.getenv("GBIF_email")
, username = Sys.getenv("GBIF_user")
, password = Sys.getenv("GBIF_pwd")
, caching = TRUE
, download_reason_id = 10 # testing
)
# Switch the active atlas to GBIF for the first query
galah::galah_config(atlas = "GBIF")
#> Atlas selected: Global Biodiversity Information Facility (GBIF) [Global]
# Australian Bustards--------
# in the year 2000 (matches the year == 2000 filter below)
## 01: atlas = gbif --------
# Cache the download locally so re-running the example doesn't re-query GBIF
save_file <- fs::path(out_dir, "qry01", "qry01.rds")
if(!file.exists(save_file)) {
qry01 <- galah::galah_call() %>%
galah::galah_identify("Ardeotis australis") %>%
galah::galah_filter(year == 2000) %>%
galah::atlas_occurrences() %>%
dplyr::collect()
rio::export(qry01
, save_file
)
} else {
# Saved result exists: load it instead of downloading again
qry01 <- rio::import(save_file)
}
## 02: atlas = ala ----------
# Repeat the same query against the ALA for comparison
galah::galah_config(atlas = "ALA")
#> Atlas selected: Atlas of Living Australia (ALA) [Australia]
# The ALA only requires a registered email (no username/password)
galah::galah_config(email = Sys.getenv("ALA_email"))
# 'qry' used for both qry02 and qry03
qry <- galah::galah_call() %>%
galah::galah_identify("Ardeotis australis") %>%
galah::galah_filter(year == 2000)
# Cache this download too
save_file <- fs::path(out_dir, "qry02", "qry02.rds")
if(!file.exists(save_file)) {
qry02 <- qry %>%
galah::atlas_occurrences()
rio::export(qry02
, save_file
)
} else {
qry02 <- rio::import(save_file
, setclass = "tibble"
)
}
# similar (but not identical) # of records between the two atlases
nrow(qry01)
#> [1] 856
nrow(qry02)
#> [1] 857
## 03: get_galah ---------
# Same query as qry02, but via get_galah(): this also remaps field names
# using the data_map and saves the result as a .parquet file in save_dir
qry03 <- get_galah(save_dir = fs::path(out_dir, "qry03")
, data_map = data_map
, qry = qry
)
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/qry03/galah/galah.parquet
# again, not quite the same number of records
nrow(qry02)
#> [1] 857
nrow(qry03)
#> [1] 852
# get_galah removes, via envImport::remap_data_names, NULL dates, lat and long
# see arguments to envImport::remap_data_names
# filtering qry02 on those columns gives the same result as qry03
qry02 %>%
dplyr::filter(!is.na(eventDate)
, !is.na(decimalLatitude)
, !is.na(decimalLongitude)
) %>%
nrow()
#> [1] 852
# names from data_map: qry02 keeps the raw galah field names, while qry03
# has been remapped to the data_map's standardised names
names(qry02)
#> [1] "recordID" "scientificName" "taxonConceptID" "decimalLatitude"
#> [5] "decimalLongitude" "eventDate" "occurrenceStatus" "dataResourceName"
names(qry03)
#> [1] "data_name" "site" "date" "lat"
#> [5] "long" "original_name" "common" "nsx"
#> [9] "occ_derivation" "quantity" "survey" "rel_metres"
#> [13] "method" "obs" "denatured" "kingdom"
#> [17] "occ" "year" "month"
## 04: get_galah with profile -------
# galah::apply_profile() adds a data-quality profile ("CSDM") to the
# query, excluding records that fail the profile's quality filters
qry04 <- get_galah(save_dir = fs::path(out_dir, "qry04")
, data_map = data_map
, qry = qry %>%
galah::apply_profile(CSDM)
)
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/qry04/galah/galah.parquet
# lost some records due to the profile
nrow(qry04)
#> [1] 715
############################################
# Combine data --------
## get_galah for aoi -------
# aoi limits occurrences spatially; sub_dir collects outputs from several
# data sources into one directory (passed via ...)
bio_all_galah <- get_galah(aoi = envImport::aoi
, save_dir = out_dir
, data_map = data_map
, sub_dir = "bio_all"
)
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/galah.parquet
## get_tern for aoi --------
bio_all_tern <- get_tern(aoi = envImport::aoi
, save_dir = out_dir
, data_map = data_map
, sub_dir = "bio_all"
)
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/tern.parquet
## or using get_data -------
# to get both galah and tern
datas <- c("galah", "tern", "gbif")
# galah and tern already run from above, so with get_new = FALSE those two
# load from the saved files; gbif uses the previous download key
temp <- purrr::map(datas
, \(x) get_data(x
, save_dir = out_dir
, get_new = FALSE
, aoi = envImport::aoi
, data_map = data_map
, sub_dir = "bio_all"
, previous_key = "0057643-240626123714530"
)
)
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/galah.parquet
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/tern.parquet
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/gbif.parquet
## single dataset --------
# Row-bind the per-source .parquet files into a single dataframe
bio_all_files <- fs::dir_ls(fs::path(out_dir, "bio_all")
, regexp = "\\.parquet"
)
bio_all <- purrr::map_dfr(bio_all_files, \(x) rio::import(x))
if(FALSE) {
# check for misaligned classes
# (not run) a column whose class differs between sources would break the
# row-bind above; if so, force a common arrow schema as below
check <- purrr::map_dfr(temp
, \(x) purrr::map(x, class)
) %>%
purrr::map_dfr(\(x) length(unique(na.omit(x))) == 1) %>%
tidyr::pivot_longer(everything()) %>%
dplyr::filter(!value)
use_schema <- arrow::schema(bio_all)
use_schema$quantity <- arrow::Field$create("quantity", arrow::string())
bio_all <- arrow::open_dataset(bio_all_files
, schema = use_schema
) %>%
dplyr::collect()
}
# 'bio_all' is now the sum of its components
nrow(bio_all) == sum(purrr::map_dbl(temp, nrow))
#> [1] TRUE
# clean up -------
# return to original atlas
galah::galah_config(atlas = old_atlas)