Initial commit with R functions

This commit is contained in:
philbastian 2020-06-12 16:44:42 -04:00
parent f7c32cb22f
commit d0e9c190a5
8 changed files with 206 additions and 0 deletions

2
.gitignore vendored
View File

@ -1,2 +1,4 @@
.Rhistory
.Rproj.user
.Rbuildignore

12
DESCRIPTION Normal file
View File

@ -0,0 +1,12 @@
Package: matchcountry
Title: Match Country
Version: 0.0.0.9000
Authors@R: person("Phillip", "Bastian", email = "pbastian@stern.nyu.edu",
role = c("aut", "cre"), comment = "Research Scholar, NYU Stern")
Description: Utility to simplify country name matching.
License: MIT
URL: https://www.github.com/philbastian/matchcountry
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.0

4
NAMESPACE Normal file
View File

@ -0,0 +1,4 @@
# Generated by roxygen2: do not edit by hand
export(download.mc)
export(match.country)

25
R/download.mc.R Normal file
View File

@ -0,0 +1,25 @@
#' Download the MatchCountry tables from Github
#'
#' Replaces the `match.csv` and `countrydata.csv` files stored in the installed
#' matchcountry package directory with copies read from the given sources.
#'
#' @param match URL of, or local path to, the match csv file
#' @param countrydata URL of, or local path to, the countrydata csv file
#'
#' @details
#' This updates the Match Country files in the library to their current versions. It will overwrite
#' the existing files, so this should be used with care if reproducibility of older analyses is
#' a priority. You can also use this to make your own custom files for matching.
#'
#' Both sources are read completely before anything is written, so a failed
#' download leaves the installed files untouched. An error is raised if the
#' matchcountry package cannot be located in the library.
#'
#' @return `NULL`, invisibly. Called for its side effect of writing the two files.
#'
#' @examples
#' \dontrun{
#' # Needs network access and overwrites files in the package library
#' download.mc()
#' }
#'
#' @export
download.mc <- function(match = "https://raw.githubusercontent.com/philbastian/matchcountry/master/match.csv",
                        countrydata = "https://raw.githubusercontent.com/philbastian/matchcountry/master/countrydata.csv") {
  # `package` follows `...` in system.file(), so it has to be passed by name;
  # a bare second string is treated as another path component inside the base
  # package and the call returns "".
  pkg_dir <- system.file(package = "matchcountry")
  if (!nzchar(pkg_dir)) {
    stop("package 'matchcountry' is not installed, so there is nowhere to store the tables",
         call. = FALSE)
  }

  # Read every line of one source, always releasing the connection.
  fetch_lines <- function(src) {
    # url() only accepts URLs; plain paths must go through file().
    is_url <- grepl("^[A-Za-z][A-Za-z0-9+.-]+://", src)
    con <- if (is_url) url(src) else file(src)
    on.exit(close(con), add = TRUE)
    readLines(con, encoding = "UTF-8", warn = FALSE)
  }

  # Fetch both tables first so a failure cannot leave a half-updated install.
  match_lines <- fetch_lines(match)
  countrydata_lines <- fetch_lines(countrydata)

  writeLines(match_lines, file.path(pkg_dir, "match.csv"))
  writeLines(countrydata_lines, file.path(pkg_dir, "countrydata.csv"))
  invisible(NULL)
}

64
R/match.country.R Normal file
View File

@ -0,0 +1,64 @@
#' Match country
#'
#' @param country A vector of country names or ISO 3 digit alpha codes
#' @param output The column to return: `"iso"` (the default) or the name of any
#'   column of `countrydata`
#' @param language The language(s) of the inputs (by default, English; `NULL` will
#'   match on all languages)
#' @param match The matching table: a data frame with `name`, `iso` and `language`
#'   columns, or the path to a csv file holding one
#' @param countrydata The country data table: a data frame with an `iso` column, or
#'   the path to a csv file holding one. Only read when `output` is not `"iso"`.
#'
#' @return A vector with the same length as country with the matching results
#' @details This function is designed to recognize different variations of country names and
#' standardize them. For example, "St. Kitts & Nevis", "Saint Kitts & Nevis" and "St Kitts and
#' Nevis" all refer to the same place, but a simple merge with a table would fail to match them
#' all. The database has a table of common alternative names. Further, the algorithm removes
#' extended characters that might lead to confusion (St. vs. Saint, for example). Where a match
#' cannot be found, NA is returned in its place.
#'
#' Names are compared after normalisation (upper case, a few standard
#' substitutions, everything except the letters A-Z removed). The comparison is
#' exact: an abbreviation or prefix of a known name is not a match.
#'
#' If no language is specified, the algorithm will match on all languages, but this is less
#' efficient and could be prone to errors.
#'
#' The default output is the iso field of the countrydata data frame; any column of the countrydata
#' can be used, however.
#'
#' @examples
#'
#' match.country("United Republic of Tanzania")
#' match.country("Tanzania")
#'
#' @export
match.country <- function(country, output = "iso", language = "english",
                          match = read.csv(system.file("match.csv", package = "matchcountry"),
                                           na.strings = "", stringsAsFactors = FALSE),
                          countrydata = read.csv(system.file("countrydata.csv", package = "matchcountry"),
                                                 na.strings = "", stringsAsFactors = FALSE)) {
  if (!is.character(output) || length(output) != 1L || is.na(output)) {
    stop("`output` must be a single column name", call. = FALSE)
  }

  # Accept either a ready-made data frame or the path of a csv file.
  as_table <- function(x) {
    if (is.character(x) && length(x) == 1L) {
      read.csv(x, na.strings = "", stringsAsFactors = FALSE)
    } else {
      x
    }
  }

  # Reduce a name to its comparison key: upper case, standard substitutions,
  # then only the letters A-Z. Missing names stay missing.
  normalise <- function(input) {
    replacements <- c("&" = "AND", "SAINT" = "ST", "ISDS" = "ISLANDS", "REPUBLIC OF" = "")
    absent <- is.na(input)
    key <- toupper(input)
    for (pattern in names(replacements)) {
      key <- gsub(pattern, replacements[[pattern]], key, fixed = TRUE)
    }
    # The letters are spelled out because a range such as A-Z is locale dependent.
    key <- gsub("[^ABCDEFGHIJKLMNOPQRSTUVWXYZ]", "", key)
    key[absent] <- NA_character_
    key
  }

  lookup <- as_table(match)
  if (!is.data.frame(lookup) || is.null(lookup$name) || is.null(lookup$iso)) {
    stop("`match` must be a data frame (or csv path) with `name` and `iso` columns",
         call. = FALSE)
  }
  if (!is.null(language)) {
    # %in% never yields NA, so rows with a missing language are simply dropped.
    wanted <- toupper(lookup$language) %in% toupper(language)
    lookup <- lookup[wanted, , drop = FALSE]
  }

  # Keep the first row for every distinct key and ignore rows without a name.
  keys <- normalise(lookup$name)
  usable <- !is.na(keys) & !duplicated(keys)

  # base::match() compares exactly; indexing a data frame by row name would
  # partially match, turning any unique prefix into a hit. The `base::` prefix
  # also keeps the call clearly apart from the `match` argument.
  hit <- base::match(normalise(country), keys[usable])
  iso <- lookup$iso[usable][hit]
  if (identical(output, "iso")) {
    return(iso)
  }

  details <- as_table(countrydata)
  if (!is.data.frame(details) || !all(c("iso", output) %in% names(details))) {
    stop("`output` must be \"iso\" or a column of `countrydata`; got \"", output, "\"",
         call. = FALSE)
  }
  row <- base::match(iso, details$iso)
  # An unmatched country must stay NA even if countrydata has a missing iso.
  row[is.na(iso)] <- NA
  details[[output]][row]
}

28
man/download.mc.Rd Normal file
View File

@ -0,0 +1,28 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/download.mc.R
\name{download.mc}
\alias{download.mc}
\title{Download the MatchCountry tables from Github}
\usage{
download.mc(
match = "https://raw.githubusercontent.com/philbastian/matchcountry/master/match.csv",
countrydata = "https://raw.githubusercontent.com/philbastian/matchcountry/master/countrydata.csv"
)
}
\arguments{
\item{match}{Path to the match csv file}
\item{countrydata}{Path to the countrydata csv file
This updates the Match Country files in the library to their current versions. It will overwrite
the existing files, so this should be used with care if reproducibility of older analyses is
a priority. You can also use this to make your own custom files for matching.}
}
\description{
Download the MatchCountry tables from Github
}
\examples{
download.mc()
}

54
man/match.country.Rd Normal file
View File

@ -0,0 +1,54 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/match.country.R
\name{match.country}
\alias{match.country}
\title{Match country}
\usage{
match.country(
country,
output = "iso",
language = "english",
match = read.csv(system.file("match.csv", "matchcountry"), na.strings = "",
stringsAsFactors = FALSE),
countrydata = read.csv(system.file("countrydata.csv", "matchcountry"), na.strings =
"", stringsAsFactors = FALSE)
)
}
\arguments{
\item{country}{A vector of country names or ISO 3 digit alpha codes}
\item{output}{A column of the MatchCountry table (mc) to return}
\item{language}{The language of the inputs (by default, English; NULL will match on all
languages)}
\item{match}{A csv file to use for matching}
\item{countrydata}{A csv file to use}
}
\value{
A vector with the same length as country with the matching results
}
\description{
Match country
}
\details{
This function is designed to recognize different variations of country names and
standardize them. For example, "St. Kitts & Nevis", "Saint Kitts & Nevis" and "St Kitts and
Nevis" all refer to the same place, but a simple merge with a table would fail to match them
all. The database has a table of common alternative names. Further, the algorithm removes
extended characters that might lead to confusion (St. vs. Saint, for example). Where a match
cannot be found, NA is returned in its place.
If no language is specified, the algorithm will match on all languages, but this is less
efficient and could be prone to errors.
The default output is the iso field of the countrydata data frame; any column of the countrydata
can be used, however.
}
\examples{
match.country("United Republic of Tanzania")
match.country("Tanzania")
}

17
matchcountry.Rproj Normal file
View File

@ -0,0 +1,17 @@
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source