Comments (3)
# Attach the packages used by this reprex. Startup messages are suppressed;
# the "built under R version" warnings are still emitted (recorded below).
suppressPackageStartupMessages({
  library(hardhat)
  library(vctrs)
  library(gapminder)
  library(rsample)
  library(dplyr)
})
#> Warning: package 'rsample' was built under R version 3.5.2
#> Warning: package 'dplyr' was built under R version 3.5.2
# Ask rlang not to print a backtrace when an error is raised, which keeps the
# recorded error output short.
options(rlang__backtrace_on_error = "none")
# initial_split() assigns rows to train/test at random. Fix the RNG seed so
# the partition (and therefore the printed rows and the warning locations) is
# the same on every run.
# NOTE: the `#>` output recorded in this file was captured before the seed was
# added, so row-level details may differ from a fresh run.
set.seed(123)
split <- initial_split(gapminder)
gap_train <- training(split)
gap_test <- testing(split)
# Zero-row slice of the training data: it keeps the column names and types
# (including factor levels) but no rows (this is `info`!). Every scenario
# casts new data to this prototype.
gap_train_0 <- vec_slice(gap_train, 0L)
# all of the df - df casts are due to df_col_cast()
# which is called from vec_cast()
# ///////////////////////
# Scenario: `continent` arrives as a character vector instead of a factor.
gap_test2 <- gap_test %>%
  mutate(continent = as.character(continent))
# Casting to the zero-row training prototype converts the column back without
# any warning or error.
common2 <- vec_cast(x = gap_test2, to = gap_train_0)
# The column carries the training levels again.
levels(common2[["continent"]])
#> [1] "Africa" "Americas" "Asia" "Europe" "Oceania"
# Takeaway) This is good
# ///////////////////////
# Scenario: `continent` is character AND contains a value ("jupyter") that is
# not one of the training levels, i.e. it has too many implicit levels.
gap_test2b <- gap_test %>%
  mutate(continent = as.character(continent))
gap_test2b[["continent"]][1L] <- "jupyter"
# The cast still succeeds, but it warns about the position it could not map.
common2b <- vec_cast(x = gap_test2b, to = gap_train_0)
#> Warning: Lossy cast from <character> to <factor<69262>>
#> Locations: 1
# The column carries the training levels again.
levels(common2b[["continent"]])
#> [1] "Africa" "Americas" "Asia" "Europe" "Oceania"
# Takeaway) This is good, but a better warning would be
# good. This is a TOO MANY LEVELS problem.
# ///////////////////////
# Scenario: `continent` is character AND one training level never appears in
# it: every value matching "Asia" is replaced by NA (gsub() returns NA for
# matched elements when the replacement is NA).
gap_test2c <- gap_test %>%
  mutate(continent = as.character(continent))
gap_test2c[["continent"]] <- gsub(
  pattern = "Asia",
  replacement = NA_character_,
  x = gap_test2c[["continent"]]
)
# The cast succeeds without any warning or error.
common2c <- vec_cast(x = gap_test2c, to = gap_train_0)
# All training levels are present, including the unused "Asia".
levels(common2c[["continent"]])
#> [1] "Africa" "Americas" "Asia" "Europe" "Oceania"
# Takeaway) This is good
# ///////////////////////
# Scenario: `continent` is a numeric column (the constant 1 in every row).
gap_test3 <- gap_test %>%
  mutate(continent = 1)
# The cast fails; the message names only the two types, not the column.
common3 <- vec_cast(x = gap_test3, to = gap_train_0)
#> Error: Can't cast <double> to <factor<69262>>
# Takeaway) This is good, but want a better error message
# would really like this better message to be vctrs' job
# ///////////////////////
# Scenario: the new data carries an extra column `x` that the training data
# never had.
gap_test4 <- mutate(gap_test, x = 4)
# The cast drops `x` and signals a lossy-cast *warning* (not an error) that
# names the dropped variable. It is a decent message, but I think we would
# still rather have our own.
# Cast against the zero-row prototype, like every other scenario in this
# script: only the column types of `to` are needed, not the training rows.
vec_cast(gap_test4, gap_train_0)
#> Warning: Lossy cast from <tbl_df<
#> country : factor<bf6dc>
#> continent: factor<69262>
#> year : integer
#> lifeExp : double
#> pop : integer
#> gdpPercap: double
#> x : double
#> >> to <tbl_df<
#> country : factor<bf6dc>
#> continent: factor<69262>
#> year : integer
#> lifeExp : double
#> pop : integer
#> gdpPercap: double
#> >>
#> Locations:
#> Dropped variables: `x`
#> # A tibble: 426 x 6
#> country continent year lifeExp pop gdpPercap
#> <fct> <fct> <int> <dbl> <int> <dbl>
#> 1 Afghanistan Asia 1952 28.8 8425333 779.
#> 2 Afghanistan Asia 1987 40.8 13867957 852.
#> 3 Albania Europe 1987 72 3075321 3739.
#> 4 Albania Europe 1992 71.6 3326498 2497.
#> 5 Albania Europe 2002 75.7 3508512 4604.
#> 6 Algeria Africa 1957 45.7 10270856 3014.
#> 7 Algeria Africa 1977 58.0 17152804 4910.
#> 8 Algeria Africa 1987 65.8 23254956 5681.
#> 9 Algeria Africa 2007 72.3 33333216 6223.
#> 10 Angola Africa 1962 34 4826015 4269.
#> # … with 416 more rows
# Takeaway) This is good, but probably want a custom error message
# because vctrs "types" its warnings, I think I can do this
# ///////////////////////
# Scenario: the new data is missing the `pop` column.
gap_test5 <- gap_test %>%
  select(-pop)
# The cast raises nothing and fills the missing column with NA values.
vec_cast(x = gap_test5, to = gap_train_0)
#> # A tibble: 426 x 6
#> country continent year lifeExp pop gdpPercap
#> <fct> <fct> <int> <dbl> <int> <dbl>
#> 1 Afghanistan Asia 1952 28.8 NA 779.
#> 2 Afghanistan Asia 1987 40.8 NA 852.
#> 3 Albania Europe 1987 72 NA 3739.
#> 4 Albania Europe 1992 71.6 NA 2497.
#> 5 Albania Europe 2002 75.7 NA 4604.
#> 6 Algeria Africa 1957 45.7 NA 3014.
#> 7 Algeria Africa 1977 58.0 NA 4910.
#> 8 Algeria Africa 1987 65.8 NA 5681.
#> 9 Algeria Africa 2007 72.3 NA 6223.
#> 10 Angola Africa 1962 34 NA 4269.
#> # … with 416 more rows
# Takeaway) We should let shrink() be noisy here and error
# ///////////////////////
# Scenario: the test factor declares an additional level ("extra_level"), but
# none of its values actually use it.
gap_test6 <- gap_test %>%
  mutate(
    continent = factor(continent, levels = c(levels(continent), "extra_level"))
  )
# The cast drops the unused level without any warning.
common6 <- vec_cast(x = gap_test6, to = gap_train_0)
levels(common6[["continent"]])
#> [1] "Africa" "Americas" "Asia" "Europe" "Oceania"
# Takeaway) Silence is okay here as the values aren't actually affected
# ///////////////////////
# Scenario: the test factor has a level the training prototype lacks, AND some
# values use that level. To set this up, the first level (Africa) is removed
# from the training prototype rather than added to the test data.
gap_train_0_6b <- gap_train_0 %>%
  mutate(continent = factor(continent, levels = levels(continent)[-1]))
# The cast warns, drops the level, and turns the affected positions into NA.
common6b <- vec_cast(x = gap_test, to = gap_train_0_6b)
#> Warning: Lossy cast from <factor<69262>> to <factor<e5252>>
#> Locations: 6, 7, 8, 9, 10, 11, 12, 13, 33, 34, 35, 42, 43, 47, 48, 49, 5...
# "Africa" is no longer among the levels.
levels(common6b[["continent"]])
#> [1] "Americas" "Asia" "Europe" "Oceania"
# Takeaway) Noisy is good, but I think I want a different warning.
# Again, capture the typed warning. This could check if `x` is a factor
# then you'd know the lossy cast is specific to having too many factor levels
# ///////////////////////
# Scenario: the test factor has fewer levels than the training data (its
# first level is removed).
gap_test7 <- gap_test %>%
  mutate(continent = factor(continent, levels = levels(continent)[-1]))
# The cast restores the full set of training levels without any warning.
common7 <- vec_cast(x = gap_test7, to = gap_train_0)
levels(common7[["continent"]])
#> [1] "Africa" "Americas" "Asia" "Europe" "Oceania"
Created on 2019-02-25 by the reprex package (v0.2.1.9000)
from hardhat.
Closed by #71
from hardhat.
This issue has been automatically locked. If you believe you have found a related problem, please file a new issue (with a reprex: https://reprex.tidyverse.org) and link to this issue.
from hardhat.
Related Issues (20)
- forge fails with non-standard roles HOT 4
- Release hardhat 1.0.0 HOT 1
- Error in out$extras$final(predictors_extras, outcomes_extras): argument "outcomes_extras" is missing, with no default HOT 15
- Release hardhat 1.1.0 HOT 2
- get_outcome_levels blueprint method? HOT 5
- Post recipes 1.0.0 upkeep HOT 1
- Release hardhat 1.2.0 HOT 1
- Pass through strings_as_factors arg HOT 3
- Problem with predicting on new data for character column HOT 5
- Problem with a formula with spaces in the name of a factor and indicators = "none" HOT 2
- `mold()` inconsistently preserves (with XY method) or ignores (with formula method) non-base vector classes HOT 2
- Do a pass over `validate_is()` with an eye towards performance HOT 1
- Avoid `as_tibble()` where possible HOT 2
- multi-outcomes support for `spruce_prob_multi` shall clarify input format for multiple `pred_levels` HOT 2
- Upkeep for hardhat HOT 1
- Regression in development version of hardhat when using sf objects HOT 3
- Release hardhat 1.3.0 HOT 1
- Dynamically calculate weights HOT 5
- Using a division on the left-hand side of a formula throws an "Interaction terms can't be specified on the LHS of `formula`" HOT 2
- importance weights not compatible with DALEXtra::model_profile HOT 2
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Something interesting about the web. A new door to the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Something interesting about visualization and data art.
-
Game
Something interesting about games, to make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from hardhat.