Fix #32 (#35)

* Fix #32 * Fix the edge case of exclusive leading spaces when `trim_ws`'s FALSE * Update Benchmark * Bump ver.
gesistsa · Jun 11, 2024 · 9973c33 · 9973c33
1 parent 7d08b05
commit 9973c33
Show file tree

Hide file tree

Showing 13 changed files with 119 additions and 39 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: minty
 Title: Minimal Type Guesser
-Version: 0.0.3
+Version: 0.0.4
 Authors@R: c(
     person("Chung-hong", "Chan", role = c("aut", "cre"), email = "[email protected]", comment = c(ORCID = "0000-0002-6232-7530")),
     person("Hadley", "Wickham", , "[email protected]", role = "aut", comment = "author of the ported code from readr"),

diff --git a/R/cpp11.R b/R/cpp11.R
@@ -1,7 +1,7 @@
 # Generated by cpp11: do not edit by hand
 
-collectorGuess <- function(input, locale_, guessInteger, guess_max) {
-  .Call(`_minty_collectorGuess`, input, locale_, guessInteger, guess_max)
+collectorGuess <- function(input, locale_, guessInteger, guess_max, trim_ws) {
+  .Call(`_minty_collectorGuess`, input, locale_, guessInteger, guess_max, trim_ws)
 }
 
 parse_vector_ <- function(x, collectorSpec, locale_, na, trim_ws) {

diff --git a/R/parser.R b/R/parser.R
@@ -174,7 +174,6 @@ col_number <- function() {
     collector("number")
 }
 
-
 #' Parse using the "best" type
 #'
 #' `parse_guess()` returns the parser vector. This function uses a number of heuristics
@@ -203,7 +202,8 @@ col_number <- function() {
 #' # ISO 8601 date times
 #' parse_guess(c("2010-10-10"))
 parse_guess <- function(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE, guess_integer = FALSE, guess_max = NA, .return_problems = FALSE) {
-    parse_vector(x, guess_parser(x, locale, guess_integer = guess_integer, na = na, guess_max = guess_max), na = na, locale = locale, trim_ws = trim_ws,
+    parse_vector(x, guess_parser(x, locale, guess_integer = guess_integer, na = na, guess_max = guess_max, trim_ws = trim_ws),
+                 na = na, locale = locale, trim_ws = trim_ws,
                  .return_problems = .return_problems)
 }
 
@@ -213,7 +213,7 @@ col_guess <- function() {
     collector("guess")
 }
 
-guess_parser <- function(x, locale = default_locale(), guess_integer = FALSE, na = c("", "NA"), guess_max = 1000) {
+guess_parser <- function(x, locale = default_locale(), guess_integer = FALSE, na = c("", "NA"), guess_max = 1000, trim_ws = FALSE) {
     x[x %in% na] <- NA_character_
     stopifnot(is.locale(locale))
     if (is.na(guess_max)) {
@@ -223,7 +223,7 @@ guess_parser <- function(x, locale = default_locale(), guess_integer = FALSE, na
     if (abs(guess_max) == Inf || is.nan(guess_max) || guess_max < 1 || is.na(guess_max)) {
         guess_max <- length(x)
     }
-    collectorGuess(x, locale, guessInteger = guess_integer, as.integer(guess_max))
+    collectorGuess(x, locale, guessInteger = guess_integer, as.integer(guess_max), trim_ws)
 }
 
 #' Parse factors

diff --git a/R/type_convert.R b/R/type_convert.R
@@ -46,7 +46,8 @@ type_convert <- function(df, col_types = NULL, na = c("", "NA"), trim_ws = TRUE,
         locale = locale,
         na = na,
         guess_integer = guess_integer,
-        guess_max = guess_max
+        guess_max = guess_max,
+        trim_ws = trim_ws
     )
 
     specs <- col_spec_standardise(

diff --git a/README.Rmd b/README.Rmd
@@ -22,7 +22,9 @@ knitr::opts_chunk$set(
 
 `minty` (**Min**imal **ty**pe guesser) is a package with the type inferencing and parsing tools (the so-called 1e parsing engine) extracted from `readr` (with permission, see this issue [tidyverse/readr#1517](https://github.com/tidyverse/readr/issues/1517)). Since July 2021, these tools are not used internally by `readr` for parsing text files. Now `vroom` is used by default, unless explicitly call the first edition parsing engine (see the explanation on [editions](https://github.com/tidyverse/readr?tab=readme-ov-file#editions)).
 
-`readr`'s 1e type inferencing and parsing tools are used by various R packages, e.g. `readODS` and `surveytoolbox` for parsing in-memory objects, but those packages do not use the main functions (e.g. `readr::read_delim()`) of `readr`. As explained in the README of `readr`, those 1e code will be eventually removed from `readr`. `minty` aims at providing a set of minimal, long-term, and compatible type inferencing and parsing tools for those packages.
+`readr`'s 1e type inferencing and parsing tools are used by various R packages, e.g. `readODS` and `surveytoolbox` for parsing in-memory objects, but those packages do not use the main functions (e.g. `readr::read_delim()`) of `readr`. As explained in the README of `readr`, those 1e code will be eventually removed from `readr`.
+
+`minty` aims at providing a set of minimal, long-term, and compatible type inferencing and parsing tools for those packages. You might consider `minty` to be 1.5e parsing engine.
 
 ## Installation
 
@@ -165,6 +167,25 @@ minty::parse_guess(c("1", "2", "drei"), guess_max = 2)
 readr::parse_guess(c("1", "2", "drei"))
 ```
 
+For `parse_guess()` and `type_convert()`, `trim_ws` is considered before type guessing (the expected behavior of `vroom::vroom()` / `readr::read_delim()`).
+
+```{r}
+minty::parse_guess(c("   1", " 2 ", " 3  "), trim_ws = TRUE)
+```
+
+```{r}
+readr::parse_guess(c("   1", " 2 ", " 3  "), trim_ws = TRUE)
+```
+
+```{r}
+##tidyverse/readr#1536
+minty::type_convert(data.frame(a = "1 ", b = " 2"), trim_ws = TRUE) |> str()
+```
+
+```{r}
+readr::type_convert(data.frame(a = "1 ", b = " 2"), trim_ws = TRUE) |> str()
+```
+
 ## Similar packages
 
 For parsing ambiguous date(time)

diff --git a/README.md b/README.md
@@ -23,9 +23,11 @@ call the first edition parsing engine (see the explanation on
 packages, e.g. `readODS` and `surveytoolbox` for parsing in-memory
 objects, but those packages do not use the main functions
 (e.g. `readr::read_delim()`) of `readr`. As explained in the README of
-`readr`, those 1e code will be eventually removed from `readr`. `minty`
-aims at providing a set of minimal, long-term, and compatible type
-inferencing and parsing tools for those packages.
+`readr`, those 1e code will be eventually removed from `readr`.
+
+`minty` aims at providing a set of minimal, long-term, and compatible
+type inferencing and parsing tools for those packages. You might
+consider `minty` to be 1.5e parsing engine.
 
 ## Installation
 
@@ -246,6 +248,41 @@ readr::parse_guess(c("1", "2", "drei"))
 #> [1] "1"    "2"    "drei"
 ```
 
+For `parse_guess()` and `type_convert()`, `trim_ws` is considered before
+type guessing (the expected behavior of `vroom::vroom()` /
+`readr::read_delim()`).
+
+``` r
+minty::parse_guess(c("   1", " 2 ", " 3  "), trim_ws = TRUE)
+#> [1] 1 2 3
+```
+
+``` r
+readr::parse_guess(c("   1", " 2 ", " 3  "), trim_ws = TRUE)
+#> [1] "1" "2" "3"
+```
+
+``` r
+##tidyverse/readr#1536
+minty::type_convert(data.frame(a = "1 ", b = " 2"), trim_ws = TRUE) |> str()
+#> 'data.frame':    1 obs. of  2 variables:
+#>  $ a: num 1
+#>  $ b: num 2
+```
+
+``` r
+readr::type_convert(data.frame(a = "1 ", b = " 2"), trim_ws = TRUE) |> str()
+#> 
+#> ── Column specification ────────────────────────────────────────────────────────
+#> cols(
+#>   a = col_character(),
+#>   b = col_double()
+#> )
+#> 'data.frame':    1 obs. of  2 variables:
+#>  $ a: chr "1"
+#>  $ b: num 2
+```
+
 ## Similar packages
 
 For parsing ambiguous date(time)

diff --git a/misc/benchmark.md b/misc/benchmark.md
@@ -7,7 +7,7 @@ suppressPackageStartupMessages(library(readr))
 Sys.time()
 ```
 
-    [1] "2024-06-10 13:52:45 CEST"
+    [1] "2024-06-11 11:50:29 CEST"
 
 Under 200 rows, simple
 
@@ -19,7 +19,7 @@ bench::mark(minty::type_convert(iris_chr), iterations = 10)
     # A tibble: 1 × 6
       expression                         min   median `itr/sec` mem_alloc `gc/sec`
       <bch:expr>                    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
-    1 minty::type_convert(iris_chr)    387µs    410µs     2377.     702KB        0
+    1 minty::type_convert(iris_chr)    394µs    426µs     2245.     703KB        0
 
 ``` r
 bench::mark(suppressMessages(readr::type_convert(iris_chr)), iterations = 10)
@@ -28,7 +28,7 @@ bench::mark(suppressMessages(readr::type_convert(iris_chr)), iterations = 10)
     # A tibble: 1 × 6
       expression                             min median `itr/sec` mem_alloc `gc/sec`
       <bch:expr>                          <bch:> <bch:>     <dbl> <bch:byt>    <dbl>
-    1 suppressMessages(readr::type_conve… 2.15ms 2.21ms      365.    1.81MB        0
+    1 suppressMessages(readr::type_conve… 2.13ms 2.18ms      371.    1.81MB        0
 
 Many rows
 
@@ -43,7 +43,7 @@ bench::mark(x <- minty::type_convert(flights_chr, guess_integer = TRUE), iterati
     # A tibble: 1 × 6
       expression                             min median `itr/sec` mem_alloc `gc/sec`
       <bch:expr>                           <bch> <bch:>     <dbl> <bch:byt>    <dbl>
-    1 x <- minty::type_convert(flights_ch… 1.02s  1.06s     0.940     189MB     17.1
+    1 x <- minty::type_convert(flights_ch… 1.06s  1.12s     0.895     189MB     16.3
 
 ``` r
 bench::mark(y <- suppressMessages(readr::type_convert(flights_chr, guess_integer = TRUE)), iterations = 5)
@@ -55,7 +55,7 @@ bench::mark(y <- suppressMessages(readr::type_convert(flights_chr, guess_integer
     # A tibble: 1 × 6
       expression                             min median `itr/sec` mem_alloc `gc/sec`
       <bch:expr>                           <bch> <bch:>     <dbl> <bch:byt>    <dbl>
-    1 y <- suppressMessages(readr::type_c… 1.01s  1.04s     0.954     153MB     17.2
+    1 y <- suppressMessages(readr::type_c… 978ms  1.01s     0.989     153MB     17.8
 
 ``` r
 all.equal(x, y)
@@ -75,7 +75,7 @@ bench::mark(x <- minty::type_convert(flights_chr, guess_integer = TRUE, guess_ma
     # A tibble: 1 × 6
       expression                             min median `itr/sec` mem_alloc `gc/sec`
       <bch:expr>                           <bch> <bch:>     <dbl> <bch:byt>    <dbl>
-    1 x <- minty::type_convert(flights_ch… 529ms  535ms      1.84     153MB     19.9
+    1 x <- minty::type_convert(flights_ch… 508ms  512ms      1.92     153MB     20.7
 
 ``` r
 bench::mark(y <- suppressMessages(readr::type_convert(flights_chr, guess_integer = TRUE)), iterations = 5)
@@ -87,7 +87,7 @@ bench::mark(y <- suppressMessages(readr::type_convert(flights_chr, guess_integer
     # A tibble: 1 × 6
       expression                             min median `itr/sec` mem_alloc `gc/sec`
       <bch:expr>                           <bch> <bch:>     <dbl> <bch:byt>    <dbl>
-    1 y <- suppressMessages(readr::type_c… 1.02s  1.02s     0.959     153MB     17.5
+    1 y <- suppressMessages(readr::type_c… 981ms  998ms     0.993     153MB     18.1
 
 ``` r
 all.equal(x, y)

diff --git a/src/CollectorGuess.cpp b/src/CollectorGuess.cpp
@@ -10,13 +10,11 @@
 
 typedef bool (*canParseFun)(const std::string&, LocaleInfo* pLocale);
 
-bool canParse(
-              const cpp11::strings& x, const canParseFun& canParseF, LocaleInfo* pLocale, unsigned int guess_max) {
+bool canParse(const cpp11::strings& x, const canParseFun& canParseF, LocaleInfo* pLocale,
+              unsigned int guess_max, bool trim_ws) {
   unsigned int n = 0;
   for (const auto & i : x) {
     n++;
-    //Rprintf("%u\n", n);
-    //Rprintf(i);
     if (n > guess_max) {
       break;
     }
@@ -27,8 +25,8 @@ bool canParse(
     if (i.size() == 0) {
       continue;
     }
-
-    if (!canParseF(std::string(i), pLocale)) {
+    auto i_str = trim_ws ? trimString(std::string(i)) : std::string(i);
+    if (!canParseF(i_str, pLocale)) {
       return false;
     }
   }
@@ -130,7 +128,8 @@ static bool isDateTime(const std::string& x, LocaleInfo* pLocale) {
     const cpp11::strings& input,
     const cpp11::list& locale_,
     bool guessInteger,
-    unsigned int guess_max) {
+    unsigned int guess_max,
+    bool trim_ws) {
   LocaleInfo locale(static_cast<SEXP>(locale_));
 
   if (input.size() == 0) {
@@ -142,25 +141,25 @@ static bool isDateTime(const std::string& x, LocaleInfo* pLocale) {
   }
 
   // Work from strictest to most flexible
-  if (canParse(input, isLogical, &locale, guess_max)) {
+  if (canParse(input, isLogical, &locale, guess_max, trim_ws)) {
     return "logical";
   }
-  if (guessInteger && canParse(input, isInteger, &locale, guess_max)) {
+  if (guessInteger && canParse(input, isInteger, &locale, guess_max, trim_ws)) {
     return "integer";
   }
-  if (canParse(input, isDouble, &locale, guess_max)) {
+  if (canParse(input, isDouble, &locale, guess_max, trim_ws)) {
     return "double";
   }
-  if (canParse(input, isNumber, &locale, guess_max)) {
+  if (canParse(input, isNumber, &locale, guess_max, trim_ws)) {
     return "number";
   }
-  if (canParse(input, isTime, &locale, guess_max)) {
+  if (canParse(input, isTime, &locale, guess_max, trim_ws)) {
     return "time";
   }
-  if (canParse(input, isDate, &locale, guess_max)) {
+  if (canParse(input, isDate, &locale, guess_max, trim_ws)) {
     return "date";
   }
-  if (canParse(input, isDateTime, &locale, guess_max)) {
+  if (canParse(input, isDateTime, &locale, guess_max, trim_ws)) {
     return "datetime";
   }
 

diff --git a/src/QiParsers.h b/src/QiParsers.h
@@ -86,8 +86,6 @@ bsd_strtod(const char* begin, const char** endptr, const char decimal_mark) {
    * Strip off leading blanks and check for a sign.
    */
   p = begin;
-  while (p != *endptr && (*p == ' ' || *p == '\t'))
-    ++p;
   if (p != *endptr && *p == '-') {
     sign = 1;
     ++p;

diff --git a/src/cpp11.cpp b/src/cpp11.cpp
@@ -6,10 +6,10 @@
 #include <R_ext/Visibility.h>
 
 // CollectorGuess.cpp
-std::string collectorGuess(const cpp11::strings& input, const cpp11::list& locale_, bool guessInteger, unsigned int guess_max);
-extern "C" SEXP _minty_collectorGuess(SEXP input, SEXP locale_, SEXP guessInteger, SEXP guess_max) {
+std::string collectorGuess(const cpp11::strings& input, const cpp11::list& locale_, bool guessInteger, unsigned int guess_max, bool trim_ws);
+extern "C" SEXP _minty_collectorGuess(SEXP input, SEXP locale_, SEXP guessInteger, SEXP guess_max, SEXP trim_ws) {
   BEGIN_CPP11
-    return cpp11::as_sexp(collectorGuess(cpp11::as_cpp<cpp11::decay_t<const cpp11::strings&>>(input), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(locale_), cpp11::as_cpp<cpp11::decay_t<bool>>(guessInteger), cpp11::as_cpp<cpp11::decay_t<unsigned int>>(guess_max)));
+    return cpp11::as_sexp(collectorGuess(cpp11::as_cpp<cpp11::decay_t<const cpp11::strings&>>(input), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(locale_), cpp11::as_cpp<cpp11::decay_t<bool>>(guessInteger), cpp11::as_cpp<cpp11::decay_t<unsigned int>>(guess_max), cpp11::as_cpp<cpp11::decay_t<bool>>(trim_ws)));
   END_CPP11
 }
 // parse.cpp
@@ -36,7 +36,7 @@ extern "C" SEXP _minty_r_is_string_cpp11(SEXP x) {
 
 extern "C" {
 static const R_CallMethodDef CallEntries[] = {
-    {"_minty_collectorGuess",    (DL_FUNC) &_minty_collectorGuess,    4},
+    {"_minty_collectorGuess",    (DL_FUNC) &_minty_collectorGuess,    5},
     {"_minty_parse_vector_",     (DL_FUNC) &_minty_parse_vector_,     5},
     {"_minty_r_is_string_cpp11", (DL_FUNC) &_minty_r_is_string_cpp11, 1},
     {"_minty_type_convert_col",  (DL_FUNC) &_minty_type_convert_col,  6},

diff --git a/src/utils.h b/src/utils.h
@@ -81,4 +81,11 @@ inline bool starts_with_comment(
   return true;
 }
 
+inline std::string trimString(std::string const &str, std::string const &whitespace=" \r\n\t\v\f") {
+  auto start = str.find_first_not_of(whitespace);
+  auto end = str.find_last_not_of(whitespace);
+
+  return str.substr(start, end - start + 1);
+}
+
 #endif
diff --git a/tests/testthat/test-parsing.R b/tests/testthat/test-parsing.R
@@ -16,3 +16,11 @@ test_that("parse_guess() guess_max", {
     expect_equal(class(parse_guess(c("1", "2", "abc"), guess_max = 2)), "numeric")
     expect_equal(class(parse_guess(c("1", "2", "abc"), guess_max = 3)), "character")
 })
+
+test_that("parse_guess() trim_ws #32 or tidyverse/readr#1536", {
+    expect_equal(parse_guess(c(" 1", "2 ", " 3 "), trim_ws = TRUE), c(1, 2, 3))
+    expect_equal(parse_guess(c(" 1", "2 ", " 3 "), trim_ws = FALSE), c(" 1", "2 ", " 3 "))
+    ## exclusive leading and trim_ws = FALSE, won't be parsed as numeric
+    expect_equal(parse_guess(c(" 1", "   2", "     3"), trim_ws = FALSE), c(" 1", "   2", "     3"))
+    expect_equal(parse_guess(c(" TRUE", "FALSE ", " T "), trim_ws = TRUE), c(TRUE, FALSE, TRUE))
+})
diff --git a/tests/testthat/test-type-convert.R b/tests/testthat/test-type-convert.R
@@ -77,6 +77,15 @@ test_that("skip behaviors, readr#1509 or minty#20", {
     expect_error(minty::type_convert(text_only, col_types = list("-", "-")), NA)
 })
 
+test_that("type_convert() trim_ws #32 or tidyverse/readr#1536", {
+    ## integration of guess_parse in type_convert()
+    x <- type_convert(data.frame(a = c("1 ", "  1"), b = c(" 2", "    2")), trim_ws = TRUE)
+    expect_equal(class(x$a), "numeric")
+    expect_equal(class(x$b), "numeric")
+    x <- type_convert(data.frame(a = c("  1")), trim_ws = FALSE)
+    expect_equal(class(x$a), "character")
+})
+
 test_that("r_is_string_cpp11", {
     expect_true(r_is_string_cpp11("a"))
     expect_true(r_is_string_cpp11(c("a")))