Initial commit with R functions

2020-06-12 16:44:42 -04:00 · 2020-06-12 16:44:42 -04:00 · d0e9c190a5
parent f7c32cb22f
commit d0e9c190a5
8 changed files with 206 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,4 @@

 .Rhistory
+.Rproj.user
+.Rbuildignore
--- a/12
+++ b/12
@ -0,0 +1,12 @@
+Package: matchcountry
+Title: Match Country
+Version: 0.0.0.9000
+Authors@R: person("Phillip", "Bastian", email = "pbastian@stern.nyu.edu", 
+    role = c("aut", "cre"), comment = "Research Scholar, NYU Stern")
+Description: Utility to simplify country name matching.
+License: MIT
+URL: https://www.github.com/philbastian/matchcountry
+Encoding: UTF-8
+LazyData: true
+Roxygen: list(markdown = TRUE)
+RoxygenNote: 7.1.0
--- a/4
+++ b/4
@ -0,0 +1,4 @@
+# Generated by roxygen2: do not edit by hand
+
+export(download.mc)
+export(match.country)
--- a/R/download.mc.R
+++ b/R/download.mc.R
@ -0,0 +1,25 @@
+#' Download the MatchCountry tables from Github
+#'
+#' Replaces the `match.csv` and `countrydata.csv` files stored inside the installed
+#' `matchcountry` package with the contents read from `match` and `countrydata`. The existing
+#' files are overwritten, so this should be used with care if reproducibility of older analyses
+#' is a priority. You can also point the arguments at your own custom files for matching.
+#'
+#' @param match URL or local path of the match csv file
+#' @param countrydata URL or local path of the countrydata csv file
+#'
+#' @return Invisibly, a character vector holding the two file paths that were written.
+#'
+#' @examples
+#' \dontrun{
+#' # Needs network access and overwrites the files in the package library
+#' download.mc()
+#' }
+#'
+#' @export
+download.mc <- function(match = "https://raw.githubusercontent.com/philbastian/matchcountry/master/match.csv", 
+                        countrydata = "https://raw.githubusercontent.com/philbastian/matchcountry/master/countrydata.csv") {
+
+  # The package must be passed by name: a second positional argument to system.file() is
+  # treated as one more path component and the lookup then happens in the base package.
+  pkg_dir <- system.file(package = "matchcountry")
+  if (!nzchar(pkg_dir)) {
+    stop("the 'matchcountry' package is not installed", call. = FALSE)
+  }
+
+  # Read every line of one source; the connection is released even when reading fails.
+  fetch <- function(path) {
+    con <- if (grepl("^(https?|ftp|file)://", path)) url(path) else file(path)
+    on.exit(close(con), add = TRUE)
+    readLines(con, encoding = "UTF-8", warn = FALSE)
+  }
+
+  # Fetch both tables before writing anything, so a failed download cannot leave the
+  # library with one new and one old file.
+  match_lines <- fetch(match)
+  countrydata_lines <- fetch(countrydata)
+
+  targets <- file.path(pkg_dir, c("match.csv", "countrydata.csv"))
+
+  # useBytes = TRUE writes the UTF-8 bytes as read instead of re-encoding them to the
+  # native encoding of the session.
+  writeLines(match_lines, targets[1], useBytes = TRUE)
+  writeLines(countrydata_lines, targets[2], useBytes = TRUE)
+
+  invisible(targets)
+}
--- a/R/match.country.R
+++ b/R/match.country.R
@ -0,0 +1,64 @@
+#' Match country
+#'
+#' @param country A vector of country names or ISO 3-letter alpha codes
+#' @param output The column to return: "iso" (the default) or any column of `countrydata`
+#' @param language The language(s) of the inputs (by default, English; NULL will match on all
+#' languages)
+#' @param match A data frame to use for matching, with columns `name`, `language` and `iso`
+#' (by default read from the match csv file shipped with the package)
+#' @param countrydata A data frame with an `iso` column and the columns available as `output`
+#' (by default read from the countrydata csv file shipped with the package)
+#'
+#' @return A vector with the same length as country with the matching results
+#' @details This function is designed to recognize different variations of country names and 
+#' standardize them. For example, "St. Kitts & Nevis", "Saint Kitts & Nevis" and "St Kitts and 
+#' Nevis" all refer to the same place, but a simple merge with a table would fail to match them
+#' all. The database has a table of common alternative names. Further, the algorithm removes 
+#' extended characters that might lead to confusion (St. vs. Saint, for example). Where a match
+#' cannot be found, NA is returned in its place. Only complete names are matched; abbreviations
+#' that merely start like a known name return NA.
+#' 
+#' If no language is specified, the algorithm will match on all languages, but this is less 
+#' efficient and could be prone to errors.
+#' 
+#' The default output is the iso field of the countrydata data frame; any column of the countrydata
+#' can be used, however.
+#'
+#' @examples
+#' 
+#' match.country("United Republic of Tanzania")
+#' match.country("Tanzania")
+#' 
+#' @export
+
+match.country <- function(country, output = "iso", language = "english",
+                          match = read.csv(system.file("match.csv", package = "matchcountry",
+                                                       mustWork = TRUE),
+                                           na.strings = "", stringsAsFactors = FALSE),
+                          countrydata = read.csv(system.file("countrydata.csv",
+                                                             package = "matchcountry",
+                                                             mustWork = TRUE),
+                                                 na.strings = "", stringsAsFactors = FALSE)) {
+  # Reduce a name to its comparison key: upper-case it, harmonise a few common spelling
+  # variants and keep only the letters A-Z. Missing inputs stay missing.
+  removepunctuation <- function(input) {
+    replace <- c("&" = "AND", "SAINT" = "ST", "ISDS" = "ISLANDS", "REPUBLIC OF" = "")
+    absent <- is.na(input)
+    key <- toupper(input)
+
+    for (a in seq_along(replace)) {
+      key <- gsub(names(replace)[a], replace[[a]], key, fixed = TRUE)
+    }
+
+    key <- gsub("[^ABCDEFGHIJKLMNOPQRSTUVWXYZ]", "", key)
+    key[absent] <- NA_character_
+    key
+  }
+
+  if (!is.character(output) || length(output) != 1L || is.na(output)) {
+    stop("'output' must be a single column name", call. = FALSE)
+  }
+
+  m1 <- match
+
+  if (!is.null(language)) {
+    # %in% never yields NA, so rows with a missing language are dropped instead of
+    # turning into all-NA rows.
+    m1 <- m1[toupper(m1$language) %in% toupper(language), , drop = FALSE]
+  }
+
+  m1$match <- removepunctuation(m1$name)
+
+  # Keep the first row for every key; keys that are missing or empty cannot identify a country.
+  usable <- !is.na(m1$match) & nzchar(m1$match) & !duplicated(m1$match)
+  m1 <- m1[usable, , drop = FALSE]
+
+  # Exact lookup. Indexing a data frame by character row names would also accept unique
+  # prefixes of a row name. base:: is spelled out because the `match` argument shares the
+  # function's name.
+  iso <- m1$iso[base::match(removepunctuation(country), m1$match)]
+
+  if (output == "iso") {
+    return(iso)
+  }
+
+  if (!all(c("iso", output) %in% names(countrydata))) {
+    stop("'countrydata' needs the columns \"iso\" and \"", output, "\"", call. = FALSE)
+  }
+
+  # Unmatched countries (NA iso) must not pick up a countrydata row whose iso is missing.
+  countrydata[[output]][base::match(iso, countrydata$iso, incomparables = NA)]
+}
--- a/man/download.mc.Rd
+++ b/man/download.mc.Rd
@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/download.mc.R
+\name{download.mc}
+\alias{download.mc}
+\title{Download the MatchCountry tables from Github}
+\usage{
+download.mc(
+  match = "https://raw.githubusercontent.com/philbastian/matchcountry/master/match.csv",
+ 
+    countrydata = "https://raw.githubusercontent.com/philbastian/matchcountry/master/countrydata.csv"
+)
+}
+\arguments{
+\item{match}{Path to the match csv file}
+
+\item{countrydata}{Path to the countrydata csv file
+
+This updates the Match Country files in the library to their current versions. It will overwrite
+the existing files, so this should be used with care if reproducibility of older analyses is
+a priority. You can also use this to make your own custom files for matching.}
+}
+\description{
+Download the MatchCountry tables from Github
+}
+\examples{
+download.mc()
+
+}
--- a/man/match.country.Rd
+++ b/man/match.country.Rd
@ -0,0 +1,54 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/match.country.R
+\name{match.country}
+\alias{match.country}
+\title{Match country}
+\usage{
+match.country(
+  country,
+  output = "iso",
+  language = "english",
+  match = read.csv(system.file("match.csv", "matchcountry"), na.strings = "",
+    stringsAsFactors = FALSE),
+  countrydata = read.csv(system.file("countrydata.csv", "matchcountry"), na.strings =
+    "", stringsAsFactors = FALSE)
+)
+}
+\arguments{
+\item{country}{A vector of country names or ISO 3 digit alpha codes}
+
+\item{output}{A column of the MatchCountry table (mc) to return}
+
+\item{language}{The language of the inputs (by default, English; NULL will match on all
+languages)}
+
+\item{match}{A csv file to use for matching}
+
+\item{countrydata}{A csv file to use}
+}
+\value{
+A vector with the same length as country with the matching results
+}
+\description{
+Match country
+}
+\details{
+This function is designed to recognize different variations of country names and
+standardize them. For example, "St. Kitts & Nevis", "Saint Kitts & Nevis" and "St Kitts and
+Nevis" all refer to the same place, but a simple merge with a table would fail to match them
+all. The database has a table of common alternative names. Further, the algorithm removes
+extended characters that might lead to confusion (St. vs. Saint, for example). Where a match
+cannot be found, NA is returned in its place.
+
+If no language is specified, the algorithm will match on all languages, but this is less
+efficient and could be prone to errors.
+
+The default output is the iso field of the countrydata data frame; any column of the countrydata
+can be used, however.
+}
+\examples{
+
+match.country("United Republic of Tanzania")
+match.country("Tanzania")
+
+}
--- a/matchcountry.Rproj
+++ b/matchcountry.Rproj
@ -0,0 +1,17 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source