get_galah(
  aoi = NULL,
  save_dir = NULL,
  get_new = FALSE,
  name = "galah",
  data_map = NULL,
  node = "ALA",
  qry = NULL,
  check_rel_metres = TRUE,
  filter_inconsistent = TRUE,
  ...
)

Arguments

aoi

Optional simple feature (sf). Used to limit the occurrences returned, via galah::galah_geolocate(). Note the limitations given in galah::galah_geolocate(): 'Shapefiles must be simple to be accepted by the ALA... If type = "polygon", WKT strings longer than 10000 characters and sf objects with more than 500 vertices will not be accepted by the ALA.'
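A minimal sketch of supplying an aoi; the object name aoi_example and the polygon coordinates are illustrative only:

  # illustrative rectangle (longitude/latitude) to pass as aoi
  aoi_example <- sf::st_as_sf(
    sf::st_sfc(
      sf::st_polygon(list(matrix(c(138, -35
                                   , 139, -35
                                   , 139, -34
                                   , 138, -34
                                   , 138, -35
                                   )
                                 , ncol = 2
                                 , byrow = TRUE
                                 )
                          ))
      , crs = 4326
      )
    )
  # occ <- get_galah(aoi = aoi_example, qry = qry, data_map = data_map)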

save_dir

Character. Path to directory into which to save outputs. If NULL, results will be saved to here::here("out", "ds", "galah"). The file will be named galah.parquet.

get_new

Logical. If FALSE, will attempt to load from existing save_dir.

name

Character. data_name value in envImport::data_map (or another data_map).

data_map

Dataframe or NULL. Mapping of fields to retrieve. See envImport::data_map for an example.
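As an assumed illustration only (the object name example_map and the wide, one-row-per-data-source layout are assumptions; see envImport::data_map for the actual structure), a mapping pairing each target column with the galah field that supplies it might look like:

  # illustrative only: column layout is an assumption, see envImport::data_map
  example_map <- tibble::tibble(data_name = "galah"
                                , original_name = "scientificName"
                                , lat = "decimalLatitude"
                                , long = "decimalLongitude"
                                , date = "eventDate"
                                , occ_derivation = "occurrenceStatus"
                                )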

node

Character. Name of the atlas to use (see galah::atlas_occurrences()). Does not appear to work with node = "GBIF" and is untested on other nodes.

qry

NULL or an object of class data_request, created using galah::galah_call(). NOTE: do not include galah::atlas_occurrences() in qry, as this is called by get_galah().

check_rel_metres

Logical. Ensure that coordinateUncertaintyInMetres is no less than generalisationInMetres? Only relevant if both columns are returned by qry.
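The intended adjustment is along these lines (a sketch only, using a hypothetical occurrences dataframe occ; not necessarily the exact internal implementation):

  # sketch: lift coordinateUncertaintyInMetres to at least generalisationInMetres
  occ <- occ %>%
    dplyr::mutate(coordinateUncertaintyInMetres = pmax(coordinateUncertaintyInMetres
                                                       , generalisationInMetres
                                                       , na.rm = TRUE
                                                       )
                  )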

filter_inconsistent

Logical. If TRUE, records where the occurrenceStatus column is inconsistent with organismQuantity are filtered (removed), e.g. a record with occurrenceStatus == "ABSENT" but organismQuantity == 10 would be removed. Only relevant if both columns are returned by qry.
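The filtering is along these lines (a sketch only, using a hypothetical occurrences dataframe occ):

  # sketch: drop records flagged ABSENT that still report a positive quantity
  occ <- occ %>%
    dplyr::filter(!(occurrenceStatus == "ABSENT" & organismQuantity > 0))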

...

Passed to envImport::file_prep() and envImport::remap_data_names().

Value

Dataframe of occurrences, also saved to a file in save_dir. A .bib file is created when download_reason_id != 10.
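The saved parquet file can be read back directly, for example (assuming the default save_dir):

  occ <- arrow::read_parquet(here::here("out", "ds", "galah", "galah.parquet"))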

Examples


  # library("envImport")

  out_dir <- file.path(system.file(package = "envImport"), "examples")

  ## config -------
  old_atlas <- galah::galah_config()$atlas$region

  galah::galah_config(email = Sys.getenv("GBIF_email")
                      , username = Sys.getenv("GBIF_user")
                      , password = Sys.getenv("GBIF_pwd")
                      , caching = TRUE
                      , download_reason_id = 10 # testing
                      )

  galah::galah_config(atlas = "GBIF")
#> Atlas selected: Global Biodiversity Information Facility (GBIF) [Global]


  # Australian Bustards--------
    # in the year 2000

  ## 01: atlas = gbif --------

  save_file <- fs::path(out_dir, "qry01", "qry01.rds")

  if(!file.exists(save_file)) {

    qry01 <- galah::galah_call() %>%
      galah::galah_identify("Ardeotis australis") %>%
      galah::galah_filter(year == 2000) %>%
      galah::atlas_occurrences() %>%
      dplyr::collect()

    rio::export(qry01
                , save_file
                )

  } else {

    qry01 <- rio::import(save_file)

  }


  ## 02: atlas = ala ----------
  galah::galah_config(atlas = "ALA")
#> Atlas selected: Atlas of Living Australia (ALA) [Australia]

  galah::galah_config(email = Sys.getenv("ALA_email"))

  # 'qry' used for both qry02 and qry03
  qry <- galah::galah_call() %>%
    galah::galah_identify("Ardeotis australis") %>%
    galah::galah_filter(year == 2000)

  save_file <- fs::path(out_dir, "qry02", "qry02.rds")

  if(!file.exists(save_file)) {

    qry02 <- qry %>%
      galah::atlas_occurrences()

    rio::export(qry02
                , save_file
                )

  } else {

    qry02 <- rio::import(save_file
                         , setclass = "tibble"
                         )

  }

  # similar (but not identical) # of records
  nrow(qry01)
#> [1] 856
  nrow(qry02)
#> [1] 857


  ## 03: get_galah ---------

  qry03 <- get_galah(save_dir = fs::path(out_dir, "qry03")
                     , data_map = data_map
                     , qry = qry
                     )
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/qry03/galah/galah.parquet

  # again, not quite the same number of records
  nrow(qry02)
#> [1] 857
  nrow(qry03)
#> [1] 852

  # get_galah removes, via envImport::remap_data_names(), records with NULL dates, lat and long
    # (see arguments to envImport::remap_data_names)
    # filtering qry02 on those columns gives the same result as qry03
  qry02 %>%
    dplyr::filter(!is.na(eventDate)
                  , !is.na(decimalLatitude)
                  , !is.na(decimalLongitude)
                  ) %>%
    nrow()
#> [1] 852

  # names from data_map
  names(qry02)
#> [1] "recordID"         "scientificName"   "taxonConceptID"   "decimalLatitude" 
#> [5] "decimalLongitude" "eventDate"        "occurrenceStatus" "dataResourceName"
  names(qry03)
#>  [1] "data_name"      "site"           "date"           "lat"           
#>  [5] "long"           "original_name"  "common"         "nsx"           
#>  [9] "occ_derivation" "quantity"       "survey"         "rel_metres"    
#> [13] "method"         "obs"            "denatured"      "kingdom"       
#> [17] "occ"            "year"           "month"         

  ## 04: get_galah with profile -------

  qry04 <- get_galah(save_dir = fs::path(out_dir, "qry04")
                     , data_map = data_map
                     , qry = qry %>%
                       galah::apply_profile(CSDM)
                     )
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/qry04/galah/galah.parquet

  # lost some records due to the profile
  nrow(qry04)
#> [1] 715


  ############################################

  # Combine data --------
  ## get_galah for aoi -------
  bio_all_galah <- get_galah(aoi = envImport::aoi
                             , save_dir = out_dir
                             , data_map = data_map
                             , sub_dir = "bio_all"
                             )
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/galah.parquet

  ## get_tern for aoi --------
  bio_all_tern <- get_tern(aoi = envImport::aoi
                           , save_dir = out_dir
                           , data_map = data_map
                           , sub_dir = "bio_all"
                           )
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/tern.parquet

  ## or using get_data -------
  # to get both galah and tern
  datas <- c("galah", "tern", "gbif")

  # galah and tern already run from above

  temp <- purrr::map(datas
                     , \(x) get_data(x
                                     , save_dir = out_dir
                                     , get_new = FALSE
                                     , aoi = envImport::aoi
                                     , data_map = data_map
                                     , sub_dir = "bio_all"
                                     , previous_key = "0057643-240626123714530"
                                     )
                     )
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/galah.parquet
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/tern.parquet
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/gbif.parquet

  ## single dataset --------
  bio_all_files <- fs::dir_ls(fs::path(out_dir, "bio_all")
                              , regexp = "\\.parquet"
                              )

  bio_all <- purrr::map_dfr(bio_all_files, \(x) rio::import(x))


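  # optional: if binding the parquet files fails due to misaligned column
    # classes, read them via arrow with an explicit schema (as below)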
  if(FALSE) {

    # check for misaligned classes
    check <- purrr::map_dfr(temp
                            , \(x) purrr::map(x, class)
                            ) %>%
      purrr::map_dfr(\(x) length(unique(na.omit(x))) == 1) %>%
      tidyr::pivot_longer(everything()) %>%
      dplyr::filter(!value)

    use_schema <- arrow::schema(bio_all)

    use_schema$quantity <- arrow::Field$create("quantity", arrow::string())

    bio_all <- arrow::open_dataset(bio_all_files
                                   , schema = use_schema
                                   ) %>%
      dplyr::collect()

  }


  # 'bio_all' is now the sum of its components
  nrow(bio_all) == sum(purrr::map_dbl(temp, nrow))
#> [1] TRUE

  # clean up -------
  # return to original atlas
  galah::galah_config(atlas = old_atlas)