Import data, running the relevant get_<data_name> function (e.g. get_galah) to requery the original data source if get_new is TRUE. Data is saved to (and imported from) file.path(save_dir, data_name, paste0(data_name, "_raw.rds")). save_dir and get_new can be passed via ... to get_<data_name>; otherwise the defaults from get_<data_name> are used: get_new = FALSE and a save_dir such that results end up at here::here("out", "ds", data_name, paste0(data_name, "_raw.rds")).
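
For example, with those defaults, results for a hypothetical 'galah' import would be saved to, and later re-imported from, a path like the one below (illustrative only; the exact location depends on save_dir):

  # illustrative only: where results are saved and re-imported under the defaults
  data_name <- "galah"
  here::here("out", "ds", data_name, paste0(data_name, "_raw.rds"))
  # e.g. ".../out/ds/galah/galah_raw.rds"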

Usage

get_data(data_name, ...)

Arguments

data_name

Character. Name of data source, e.g. 'tern' or 'galah'.

...

Passed to the relevant get_<data_name> function, e.g. get_galah() or get_tern().

Value

Data frame, either loaded from save_dir or returned by a new query to data_name. If new data is queried, an .rds results file is created, overwriting any existing file. Timing and the number of records are logged in save_dir.
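
A minimal, not-run sketch of this behaviour, assuming a 'galah' source with arguments borrowed from the Examples below:

  if(FALSE) {

    # first call: get_new = TRUE forces a requery of the data source and
    # (over)writes the results file under save_dir
    dat <- get_data("galah"
                    , save_dir = tempdir()
                    , get_new = TRUE
                    , data_map = data_map # as in the Examples below
                    )

    # second call: get_new = FALSE (the default) skips the query and simply
    # re-imports the saved results file
    dat_cached <- get_data("galah"
                           , save_dir = tempdir()
                           , get_new = FALSE
                           )

  }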

See also

Other Help with combining data sources: remap_data_names()

Examples


  # library("envImport")

  out_dir <- file.path(system.file(package = "envImport"), "examples")

  ## config -------
  old_atlas <- galah::galah_config()$atlas$region

  galah::galah_config(email = Sys.getenv("GBIF_email")
                      , username = Sys.getenv("GBIF_user")
                      , password = Sys.getenv("GBIF_pwd")
                      , caching = TRUE
                      , download_reason_id = 10 # testing
                      )

  galah::galah_config(atlas = "GBIF")
#> Atlas selected: Global Biodiversity Information Facility (GBIF) [Global]


  # Australian Bustards --------
    # in the year 2000

  ## 01: atlas = gbif --------

  save_file <- fs::path(out_dir, "qry01", "qry01.rds")

  if(!file.exists(save_file)) {

    qry01 <- galah::galah_call() %>%
      galah::galah_identify("Ardeotis australis") %>%
      galah::galah_filter(year == 2000) %>%
      galah::atlas_occurrences() %>%
      dplyr::collect()

    rio::export(qry01
                , save_file
                )

  } else {

    qry01 <- rio::import(save_file)

  }


  ## 02: atlas = ala ----------
  galah::galah_config(atlas = "ALA")
#> Atlas selected: Atlas of Living Australia (ALA) [Australia]

  galah::galah_config(email = Sys.getenv("ALA_email"))

  # 'qry' used for qry02, qry03 and qry04
  qry <- galah::galah_call() %>%
    galah::galah_identify("Ardeotis australis") %>%
    galah::galah_filter(year == 2000)

  save_file <- fs::path(out_dir, "qry02", "qry02.rds")

  if(!file.exists(save_file)) {

    qry02 <- qry %>%
      galah::atlas_occurrences()

    rio::export(qry02
                , save_file
                )

  } else {

    qry02 <- rio::import(save_file
                         , setclass = "tibble"
                         )

  }

  # similar (but not identical) number of records
  nrow(qry01)
#> [1] 856
  nrow(qry02)
#> [1] 857


  ## 03: get_galah ---------

  qry03 <- get_galah(save_dir = fs::path(out_dir, "qry03")
                     , data_map = data_map
                     , qry = qry
                     )
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/qry03/galah/galah.parquet

  # again, not quite the same number of records
  nrow(qry02)
#> [1] 857
  nrow(qry03)
#> [1] 852

  # get_galah, via envImport::remap_data_names, removes records with missing
    # dates, latitudes and longitudes (see arguments to envImport::remap_data_names)
    # filtering qry02 on those columns gives the same number of records as qry03
  qry02 %>%
    dplyr::filter(!is.na(eventDate)
                  , !is.na(decimalLatitude)
                  , !is.na(decimalLongitude)
                  ) %>%
    nrow()
#> [1] 852

  # column names: qry02 retains the original galah names; qry03 names come from data_map
  names(qry02)
#> [1] "recordID"         "scientificName"   "taxonConceptID"   "decimalLatitude" 
#> [5] "decimalLongitude" "eventDate"        "occurrenceStatus" "dataResourceName"
  names(qry03)
#>  [1] "data_name"      "site"           "date"           "lat"           
#>  [5] "long"           "original_name"  "common"         "nsx"           
#>  [9] "occ_derivation" "quantity"       "survey"         "rel_metres"    
#> [13] "method"         "obs"            "denatured"      "kingdom"       
#> [17] "occ"            "year"           "month"         

  ## 04: get_galah with profile -------

  qry04 <- get_galah(save_dir = fs::path(out_dir, "qry04")
                     , data_map = data_map
                     , qry = qry %>%
                       galah::apply_profile(CSDM)
                     )
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/qry04/galah/galah.parquet

  # lost some records due to the profile
  nrow(qry04)
#> [1] 715


  ############################################

  # Combine data --------
  ## get_galah for aoi -------
  bio_all_galah <- get_galah(aoi = envImport::aoi
                             , save_dir = out_dir
                             , data_map = data_map
                             , sub_dir = "bio_all"
                             )
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/galah.parquet

  ## get_tern for aoi --------
  bio_all_tern <- get_tern(aoi = envImport::aoi
                           , save_dir = out_dir
                           , data_map = data_map
                           , sub_dir = "bio_all"
                           )
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/tern.parquet

  ## or using get_data -------
  # to get galah, tern and gbif
  datas <- c("galah", "tern", "gbif")

  # galah and tern already run from above

  temp <- purrr::map(datas
                     , \(x) get_data(x
                                     , save_dir = out_dir
                                     , get_new = FALSE
                                     , aoi = envImport::aoi
                                     , data_map = data_map
                                     , sub_dir = "bio_all"
                                     , previous_key = "0057643-240626123714530"
                                     )
                     )
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/galah.parquet
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/tern.parquet
#> save_file will be H:/temp/nige/Rtmp8irTbY/temp_libpath4698bd41ce/envImport/examples/bio_all/gbif.parquet

  ## single dataset --------
  bio_all_files <- fs::dir_ls(fs::path(out_dir, "bio_all")
                              , regexp = "\\.parquet"
                              )

  bio_all <- purrr::map_dfr(bio_all_files, \(x) rio::import(x))


  # not run: optional check, and fix, for column classes that differ between
  # the per-source parquet files
  if(FALSE) {

    # check for misaligned classes
    check <- purrr::map_dfr(temp
                            , \(x) purrr::map(x, class)
                            ) %>%
      purrr::map_dfr(\(x) length(unique(na.omit(x))) == 1) %>%
      tidyr::pivot_longer(everything()) %>%
      dplyr::filter(!value)

    # build a common schema, forcing 'quantity' to character, then re-read
    # all the files using that schema
    use_schema <- arrow::schema(bio_all)

    use_schema$quantity <- arrow::Field$create("quantity", arrow::string())

    bio_all <- arrow::open_dataset(bio_all_files
                                   , schema = use_schema
                                   ) %>%
      dplyr::collect()

  }


  # 'bio_all' now has as many rows as the sum of its components
  nrow(bio_all) == sum(purrr::map_dbl(temp, nrow))
#> [1] TRUE

  # clean up -------
  # return to original atlas
  galah::galah_config(atlas = old_atlas)