Initial commit with R functions
This commit is contained in:
parent
f7c32cb22f
commit
d0e9c190a5
|
@ -1,2 +1,4 @@
|
|||
|
||||
.Rhistory
|
||||
.Rproj.user
|
||||
.Rbuildignore
|
|
@ -0,0 +1,12 @@
|
|||
Package: matchcountry
|
||||
Title: Match Country
|
||||
Version: 0.0.0.9000
|
||||
Authors@R: person("Phillip", "Bastian", email = "pbastian@stern.nyu.edu",
|
||||
role = c("aut", "cre"), comment = "Research Scholar, NYU Stern")
|
||||
Description: Utility to simplify country name matching.
|
||||
License: MIT
|
||||
URL: https://www.github.com/philbastian/matchcountry
|
||||
Encoding: UTF-8
|
||||
LazyData: true
|
||||
Roxygen: list(markdown = TRUE)
|
||||
RoxygenNote: 7.1.0
|
|
@ -0,0 +1,4 @@
|
|||
# Generated by roxygen2: do not edit by hand
|
||||
|
||||
export(download.mc)
|
||||
export(match.country)
|
|
@ -0,0 +1,25 @@
|
|||
#' Download the MatchCountry tables from Github
#'
#' This updates the Match Country files in the library to their current
#' versions. It will overwrite the existing files, so this should be used with
#' care if reproducibility of older analyses is a priority. You can also use
#' this to make your own custom files for matching.
#'
#' Both tables are downloaded completely before anything is written, so a
#' failed download leaves the installed files untouched.
#'
#' @param match URL of the match csv file (any scheme accepted by
#'   [base::url()], e.g. `https://` or `file://`)
#' @param countrydata URL of the countrydata csv file (same rules as `match`)
#'
#' @return `NULL`, invisibly. Called for its side effect of writing
#'   `match.csv` and `countrydata.csv` into the installed package directory.
#'
#' @examples
#' \dontrun{
#' # Needs network access and overwrites the installed tables.
#' download.mc()
#' }
#'
#' @export
download.mc <- function(match = "https://raw.githubusercontent.com/philbastian/matchcountry/master/match.csv",
                        countrydata = "https://raw.githubusercontent.com/philbastian/matchcountry/master/countrydata.csv") {
  # The package name must be passed as `package =`; given positionally it is
  # swallowed by `...` and treated as a path component inside package "base".
  # Resolve the package directory (not the individual files) so the tables can
  # also be created when they do not exist yet.
  pkg_dir <- system.file(package = "matchcountry")
  if (!nzchar(pkg_dir)) {
    stop("Package 'matchcountry' is not installed; cannot locate its files.",
         call. = FALSE)
  }

  # Read all lines from a URL, closing the connection even if the read fails.
  fetch_lines <- function(src) {
    con <- url(src)
    on.exit(close(con), add = TRUE)
    readLines(con, encoding = "UTF-8", warn = FALSE)
  }

  match_lines <- fetch_lines(match)
  countrydata_lines <- fetch_lines(countrydata)

  # useBytes = TRUE writes the UTF-8 bytes unchanged instead of re-encoding
  # them to the session's native encoding.
  writeLines(match_lines, file.path(pkg_dir, "match.csv"), useBytes = TRUE)
  writeLines(countrydata_lines, file.path(pkg_dir, "countrydata.csv"),
             useBytes = TRUE)

  invisible(NULL)
}
|
|
@ -0,0 +1,64 @@
|
|||
#' Match country
#'
#' @param country A vector of country names or ISO 3 digit alpha codes
#' @param output The column to return: `"iso"` (the default) returns the iso
#'   code from the match table; any other value must name a column of
#'   `countrydata`
#' @param language The language(s) of the inputs (by default, English; NULL
#'   will match on all languages)
#' @param match A data frame with columns `name`, `iso` and `language` to use
#'   for matching, or the path to a csv file containing such a table
#' @param countrydata A data frame with an `iso` column plus the columns
#'   available as `output`, or the path to a csv file containing such a table.
#'   Only used when `output` is not `"iso"`
#'
#' @return A vector with the same length as country with the matching results
#' @details This function is designed to recognize different variations of country names and
#' standardize them. For example, "St. Kitts & Nevis", "Saint Kitts & Nevis" and "St Kitts and
#' Nevis" all refer to the same place, but a simple merge with a table would fail to match them
#' all. The database has a table of common alternative names. Further, the algorithm removes
#' extended characters that might lead to confusion (St. vs. Saint, for example). Where a match
#' cannot be found, NA is returned in its place.
#'
#' Names are compared exactly after normalisation: an abbreviated or truncated
#' name that is not listed in the match table yields NA rather than being
#' resolved to whichever entry happens to start with the same letters.
#'
#' If no language is specified, the algorithm will match on all languages, but this is less
#' efficient and could be prone to errors.
#'
#' The default output is the iso field of the countrydata data frame; any column of the countrydata
#' can be used, however.
#'
#' @examples
#'
#' match.country("United Republic of Tanzania")
#' match.country("Tanzania")
#'
#' @export

match.country <- function(country, output = "iso", language = "english",
                          match = read.csv(system.file("match.csv", package = "matchcountry"),
                                           na.strings = "", stringsAsFactors = FALSE),
                          countrydata = read.csv(system.file("countrydata.csv", package = "matchcountry"),
                                                 na.strings = "", stringsAsFactors = FALSE)) {
  stopifnot(is.character(output), length(output) == 1L, !is.na(output))

  # Accept either a ready-made data frame or the path to a csv file.
  load_table <- function(tbl) {
    if (is.character(tbl) && length(tbl) == 1L) {
      tbl <- utils::read.csv(tbl, na.strings = "", stringsAsFactors = FALSE)
    }
    tbl
  }

  # Fail early with a readable message when a table lacks a needed column.
  require_columns <- function(tbl, columns, label) {
    absent <- setdiff(columns, names(tbl))
    if (length(absent) > 0) {
      stop("`", label, "` is missing column(s): ",
           paste(absent, collapse = ", "), call. = FALSE)
    }
  }

  # Reduce a name to a comparison key: upper-case, apply the common
  # substitutions in order, then drop everything that is not a letter A-Z.
  # The alphabet is spelled out because a range such as [A-Z] can be
  # locale-dependent.
  normalise <- function(input) {
    replacements <- c("&" = "AND", "SAINT" = "ST", "ISDS" = "ISLANDS",
                      "REPUBLIC OF" = "")
    key <- toupper(input)
    for (pattern in names(replacements)) {
      key <- gsub(pattern, replacements[[pattern]], key, fixed = TRUE)
    }
    gsub("[^ABCDEFGHIJKLMNOPQRSTUVWXYZ]", "", key)
  }

  lookup <- load_table(match)
  require_columns(lookup,
                  c("name", "iso", if (!is.null(language)) "language"),
                  "match")

  if (!is.null(language)) {
    # %in% never yields NA, so rows with a missing language are simply
    # excluded instead of turning into all-NA rows.
    in_language <- toupper(lookup[["language"]]) %in% toupper(language)
    lookup <- lookup[in_language, , drop = FALSE]
  }

  # Keep the first entry for every key; empty or missing keys (for instance
  # names without any A-Z letters) can never be matched meaningfully.
  keys <- normalise(lookup[["name"]])
  usable <- !is.na(keys) & nzchar(keys) & !duplicated(keys)
  keys <- keys[usable]
  iso_by_key <- lookup[["iso"]][usable]

  # base::match() compares exactly; indexing a data frame by row names would
  # partially match them. `base::` is explicit because the `match` argument
  # shadows the function name.
  iso <- iso_by_key[base::match(normalise(country), keys)]

  if (output == "iso") {
    return(iso)
  }

  reference <- load_table(countrydata)
  require_columns(reference, c("iso", output), "countrydata")

  rows <- base::match(iso, reference[["iso"]])
  # An unmatched country must stay NA even if countrydata has an NA iso row.
  rows[is.na(iso)] <- NA_integer_
  reference[[output]][rows]
}
|
|
@ -0,0 +1,28 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/download.mc.R
|
||||
\name{download.mc}
|
||||
\alias{download.mc}
|
||||
\title{Download the MatchCountry tables from Github}
|
||||
\usage{
|
||||
download.mc(
|
||||
match = "https://raw.githubusercontent.com/philbastian/matchcountry/master/match.csv",
|
||||
|
||||
countrydata = "https://raw.githubusercontent.com/philbastian/matchcountry/master/countrydata.csv"
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{match}{Path to the match csv file}
|
||||
|
||||
\item{countrydata}{Path to the countrydata csv file
|
||||
|
||||
This updates the Match Country files in the library to their current versions. It will overwrite
|
||||
the existing files, so this should be used with care if reproducibility of older analyses is
|
||||
a priority. You can also use this to make your own custom files for matching.}
|
||||
}
|
||||
\description{
|
||||
Download the MatchCountry tables from Github
|
||||
}
|
||||
\examples{
|
||||
download.mc()
|
||||
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/match.country.R
|
||||
\name{match.country}
|
||||
\alias{match.country}
|
||||
\title{Match country}
|
||||
\usage{
|
||||
match.country(
|
||||
country,
|
||||
output = "iso",
|
||||
language = "english",
|
||||
match = read.csv(system.file("match.csv", "matchcountry"), na.strings = "",
|
||||
stringsAsFactors = FALSE),
|
||||
countrydata = read.csv(system.file("countrydata.csv", "matchcountry"), na.strings =
|
||||
"", stringsAsFactors = FALSE)
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{country}{A vector of country names or ISO 3 digit alpha codes}
|
||||
|
||||
\item{output}{A column of the MatchCountry table (mc) to return}
|
||||
|
||||
\item{language}{The language of the inputs (by default, English; NULL will match on all
|
||||
languages)}
|
||||
|
||||
\item{match}{A csv file to use for matching}
|
||||
|
||||
\item{countrydata}{A csv file to use}
|
||||
}
|
||||
\value{
|
||||
A vector with the same length as country with the matching results
|
||||
}
|
||||
\description{
|
||||
Match country
|
||||
}
|
||||
\details{
|
||||
This function is designed to recognize different variations of country names and
|
||||
standardize them. For example, "St. Kitts & Nevis", "Saint Kitts & Nevis" and "St Kitts and
|
||||
Nevis" all refer to the same place, but a simple merge with a table would fail to match them
|
||||
all. The database has a table of common alternative names. Further, the algorithm removes
|
||||
extended characters that might lead to confusion (St. vs. Saint, for example). Where a match
|
||||
cannot be found, NA is returned in its place.
|
||||
|
||||
If no language is specified, the algorithm will match on all languages, but this is less
|
||||
efficient and could be prone to errors.
|
||||
|
||||
The default output is the iso field of the countrydata data frame; any column of the countrydata
|
||||
can be used, however.
|
||||
}
|
||||
\examples{
|
||||
|
||||
match.country("United Republic of Tanzania")
|
||||
match.country("Tanzania")
|
||||
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
Version: 1.0
|
||||
|
||||
RestoreWorkspace: Default
|
||||
SaveWorkspace: Default
|
||||
AlwaysSaveHistory: Default
|
||||
|
||||
EnableCodeIndexing: Yes
|
||||
UseSpacesForTab: Yes
|
||||
NumSpacesForTab: 2
|
||||
Encoding: UTF-8
|
||||
|
||||
RnwWeave: Sweave
|
||||
LaTeX: pdfLaTeX
|
||||
|
||||
BuildType: Package
|
||||
PackageUseDevtools: Yes
|
||||
PackageInstallArgs: --no-multiarch --with-keep.source
|
Loading…
Reference in New Issue