PsychWordVec uses two types of word vector data: wordvec (a data.table with two variables, word and vec) and embed (a matrix with dimensions as columns and words as row names). Note that matrix operations make embed much faster than wordvec. Users are therefore advised to reshape their data to embed before using the other functions.
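Because embed inherits from an ordinary numeric matrix, most computations reduce to fast matrix algebra. A minimal sketch of this workflow, assuming the package's built-in demodata (also used in the Examples below): after normalizing to unit length, cosine similarity with a target word is just a matrix product.
library(PsychWordVec)
embed = as_embed(demodata, normalize=TRUE)  # reshape once, rows rescaled to unit length
v = embed["psychotherapy"]                  # a 1 x 300 embed matrix (see Examples below)
sims = as.numeric(embed %*% t(v))           # dot products = cosine similarities (unit-length rows)
names(sims) = rownames(embed)
head(sort(sims, decreasing=TRUE))           # words most similar to "psychotherapy"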
Usage
as_embed(x, normalize = FALSE)
as_wordvec(x, normalize = FALSE)
# S3 method for class 'embed'
x[i, j]
pattern(pattern)
Arguments
- x
Object to be reshaped. See examples.
- normalize
Normalize all word vectors to unit length? Defaults to FALSE. See normalize (and the brief sketch after this list).
- i, j
Row (i) and column (j) filters to be used in embed[i, j].
- pattern
Regular expression to be used in embed[pattern("...")].
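For intuition, normalizing to unit length means dividing each word vector by its L2 norm, so that dot products between rows become cosine similarities. A minimal base-R sketch of the idea (in practice, use normalize = TRUE or the package's normalize() function):
m = as_embed(demodata)                   # NOT normalized
unit = m / sqrt(rowSums(m^2))            # divide each row by its L2 norm
round(sqrt(rowSums(unit^2))[1:5], 6)     # every row now has length 1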
Functions
- as_embed(): From wordvec (data.table) to embed (matrix).
- as_wordvec(): From embed (matrix) to wordvec (data.table).
Download
Download pre-trained word vector data (.RData):
https://psychbruce.github.io/WordVector_RData.pdf
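A minimal sketch of using a downloaded file (the file name below is hypothetical, and the object stored inside depends on the data set you download):
e = new.env()
nm = load("downloaded_wordvec.RData", envir=e)  # hypothetical file name
data = get(nm, envir=e)                         # the stored word vector object
embed = as_embed(data, normalize=TRUE)          # reshape to embed for faster computation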
Examples
dt = head(demodata, 10)
str(dt)
#> Class "wordvec" [10 × 2] (inherits: "data.table")
#> $ word : "in" "for" "that" "is" "on" ...
#> $ vec : list of 10
#> - attr(*, "dims") = 300
#> - attr(*, "normalized") = FALSE
embed = as_embed(dt, normalize=TRUE)
embed
#> # embed (matrix): [10 × 300] (normalized)
#> dim1 ... dim300
#> 1: in 0.0530 ... <300 dims>
#> 2: for -0.0085 ... <300 dims>
#> 3: that -0.0124 ... <300 dims>
#> 4: is 0.0037 ... <300 dims>
#> 5: on 0.0167 ... <300 dims>
#> 6: with -0.0160 ... <300 dims>
#> 7: said -0.0045 ... <300 dims>
#> 8: was 0.0126 ... <300 dims>
#> 9: the 0.0747 ... <300 dims>
#> 10: at -0.0354 ... <300 dims>
str(embed)
#> Class "embed" [10 × 300] (inherits: "matrix")
#> - rownames(*) : "in" "for" "that" "is" "on" ...
#> - colnames(*) : "dim1" "dim2" "dim3" "dim4" "dim5" ...
#> - attr(*, "dims") = 300
#> - attr(*, "normalized") = TRUE
wordvec = as_wordvec(embed, normalize=TRUE)
wordvec
#> # wordvec (data.table): [10 × 2] (normalized)
#> word vec
#> 1: in [ 0.0530, ...<300 dims>]
#> 2: for [-0.0085, ...<300 dims>]
#> 3: that [-0.0124, ...<300 dims>]
#> 4: is [ 0.0037, ...<300 dims>]
#> 5: on [ 0.0167, ...<300 dims>]
#> 6: with [-0.0160, ...<300 dims>]
#> 7: said [-0.0045, ...<300 dims>]
#> 8: was [ 0.0126, ...<300 dims>]
#> 9: the [ 0.0747, ...<300 dims>]
#> 10: at [-0.0354, ...<300 dims>]
str(wordvec)
#> Class "wordvec" [10 × 2] (inherits: "data.table")
#> $ word : "in" "for" "that" "is" "on" ...
#> $ vec : list of 10
#> - attr(*, "dims") = 300
#> - attr(*, "normalized") = TRUE
df = data.frame(token=LETTERS, D1=1:26/10000, D2=26:1/10000)
as_embed(df)
#> # embed (matrix): [26 × 2] (NOT normalized)
#> dim1 ... dim2
#> 1: A 0.0001 ... <2 dims>
#> 2: B 0.0002 ... <2 dims>
#> 3: C 0.0003 ... <2 dims>
#> 4: D 0.0004 ... <2 dims>
#> 5: E 0.0005 ... <2 dims>
#> 6: F 0.0006 ... <2 dims>
#> 7: G 0.0007 ... <2 dims>
#> 8: H 0.0008 ... <2 dims>
#> 9: I 0.0009 ... <2 dims>
#> 10: J 0.0010 ... <2 dims>
#> 11: K 0.0011 ... <2 dims>
#> 12: L 0.0012 ... <2 dims>
#> 13: M 0.0013 ... <2 dims>
#> 14: N 0.0014 ... <2 dims>
#> 15: O 0.0015 ... <2 dims>
#> 16: P 0.0016 ... <2 dims>
#> 17: Q 0.0017 ... <2 dims>
#> 18: R 0.0018 ... <2 dims>
#> 19: S 0.0019 ... <2 dims>
#> 20: T 0.0020 ... <2 dims>
#> 21: U 0.0021 ... <2 dims>
#> 22: V 0.0022 ... <2 dims>
#> 23: W 0.0023 ... <2 dims>
#> 24: X 0.0024 ... <2 dims>
#> 25: Y 0.0025 ... <2 dims>
#> 26: Z 0.0026 ... <2 dims>
as_wordvec(df)
#> # wordvec (data.table): [26 × 2] (NOT normalized)
#> word vec
#> 1: A [ 0.0001, ...<2 dims>]
#> 2: B [ 0.0002, ...<2 dims>]
#> 3: C [ 0.0003, ...<2 dims>]
#> 4: D [ 0.0004, ...<2 dims>]
#> 5: E [ 0.0005, ...<2 dims>]
#> 6: F [ 0.0006, ...<2 dims>]
#> 7: G [ 0.0007, ...<2 dims>]
#> 8: H [ 0.0008, ...<2 dims>]
#> 9: I [ 0.0009, ...<2 dims>]
#> 10: J [ 0.0010, ...<2 dims>]
#> 11: K [ 0.0011, ...<2 dims>]
#> 12: L [ 0.0012, ...<2 dims>]
#> 13: M [ 0.0013, ...<2 dims>]
#> 14: N [ 0.0014, ...<2 dims>]
#> 15: O [ 0.0015, ...<2 dims>]
#> 16: P [ 0.0016, ...<2 dims>]
#> 17: Q [ 0.0017, ...<2 dims>]
#> 18: R [ 0.0018, ...<2 dims>]
#> 19: S [ 0.0019, ...<2 dims>]
#> 20: T [ 0.0020, ...<2 dims>]
#> 21: U [ 0.0021, ...<2 dims>]
#> 22: V [ 0.0022, ...<2 dims>]
#> 23: W [ 0.0023, ...<2 dims>]
#> 24: X [ 0.0024, ...<2 dims>]
#> 25: Y [ 0.0025, ...<2 dims>]
#> 26: Z [ 0.0026, ...<2 dims>]
dd = rbind(dt[1:5], dt[1:5])
dd # duplicate words
#> # wordvec (data.table): [10 × 2] (NOT normalized)
#> word vec
#> 1: in [ 0.0703, ...<300 dims>]
#> 2: for [-0.0118, ...<300 dims>]
#> 3: that [-0.0157, ...<300 dims>]
#> 4: is [ 0.0070, ...<300 dims>]
#> 5: on [ 0.0267, ...<300 dims>]
#> 6: in [ 0.0703, ...<300 dims>]
#> 7: for [-0.0118, ...<300 dims>]
#> 8: that [-0.0157, ...<300 dims>]
#> 9: is [ 0.0070, ...<300 dims>]
#> 10: on [ 0.0267, ...<300 dims>]
#> ! 5 duplicate words: use `unique()` to delete duplicates
unique(dd)
#> # wordvec (data.table): [5 × 2] (NOT normalized)
#> word vec
#> 1: in [ 0.0703, ...<300 dims>]
#> 2: for [-0.0118, ...<300 dims>]
#> 3: that [-0.0157, ...<300 dims>]
#> 4: is [ 0.0070, ...<300 dims>]
#> 5: on [ 0.0267, ...<300 dims>]
dm = as_embed(dd)
dm # duplicate words
#> # embed (matrix): [10 × 300] (NOT normalized)
#> dim1 ... dim300
#> 1: in 0.0703 ... <300 dims>
#> 2: for -0.0118 ... <300 dims>
#> 3: that -0.0157 ... <300 dims>
#> 4: is 0.0070 ... <300 dims>
#> 5: on 0.0267 ... <300 dims>
#> 6: in 0.0703 ... <300 dims>
#> 7: for -0.0118 ... <300 dims>
#> 8: that -0.0157 ... <300 dims>
#> 9: is 0.0070 ... <300 dims>
#> 10: on 0.0267 ... <300 dims>
#> ! 5 duplicate words: use `unique()` to delete duplicates
unique(dm)
#> # embed (matrix): [5 × 300] (NOT normalized)
#> dim1 ... dim300
#> 1: in 0.0703 ... <300 dims>
#> 2: for -0.0118 ... <300 dims>
#> 3: that -0.0157 ... <300 dims>
#> 4: is 0.0070 ... <300 dims>
#> 5: on 0.0267 ... <300 dims>
# more examples for extracting a subset using `x[i, j]`
# (3x faster than `wordvec`)
embed = as_embed(demodata)
embed[1]
#> # embed (matrix): [1 × 300] (NOT normalized)
#> dim1 ... dim300
#> 1: in 0.0703 ... <300 dims>
embed[1:5]
#> # embed (matrix): [5 × 300] (NOT normalized)
#> dim1 ... dim300
#> 1: in 0.0703 ... <300 dims>
#> 2: for -0.0118 ... <300 dims>
#> 3: that -0.0157 ... <300 dims>
#> 4: is 0.0070 ... <300 dims>
#> 5: on 0.0267 ... <300 dims>
embed["for"]
#> # embed (matrix): [1 × 300] (NOT normalized)
#> dim1 ... dim300
#> 1: for -0.0118 ... <300 dims>
embed[pattern("^for.{0,2}$")]
#> # embed (matrix): [6 × 300] (NOT normalized)
#> dim1 ... dim300
#> 1: for -0.0118 ... <300 dims>
#> 2: form 0.0835 ... <300 dims>
#> 3: force 0.1641 ... <300 dims>
#> 4: forms -0.0206 ... <300 dims>
#> 5: forum -0.1309 ... <300 dims>
#> 6: forth -0.1074 ... <300 dims>
embed[cc("for, in, on, xxx")]
#> ✖ 1 word not found: "xxx"
#> # embed (matrix): [3 × 300] (NOT normalized)
#> dim1 ... dim300
#> 1: for -0.0118 ... <300 dims>
#> 2: in 0.0703 ... <300 dims>
#> 3: on 0.0267 ... <300 dims>
embed[cc("for, in, on, xxx"), 5:10]
#> ✖ 1 word not found: "xxx"
#> # embed (matrix): [3 × 6] (NOT normalized)
#> dim5 ... dim10
#> 1: for -0.0182 ... <6 dims>
#> 2: in 0.0693 ... <6 dims>
#> 3: on 0.0062 ... <6 dims>
embed[1:5, 5:10]
#> # embed (matrix): [5 × 6] (NOT normalized)
#> dim5 ... dim10
#> 1: in 0.0693 ... <6 dims>
#> 2: for -0.0182 ... <6 dims>
#> 3: that -0.1104 ... <6 dims>
#> 4: is -0.1328 ... <6 dims>
#> 5: on 0.0062 ... <6 dims>
embed[, 5:10]
#> # embed (matrix): [8000 × 6] (NOT normalized)
#> dim5 ... dim10
#> 1: in 0.0693 ... <6 dims>
#> 2: for -0.0182 ... <6 dims>
#> 3: that -0.1104 ... <6 dims>
#> 4: is -0.1328 ... <6 dims>
#> 5: on 0.0062 ... <6 dims>
#> -----
#> 7996: salesperson 0.0172 ... <6 dims>
#> 7997: computation 0.1465 ... <6 dims>
#> 7998: psychotherapy 0.2539 ... <6 dims>
#> 7999: equations -0.4082 ... <6 dims>
#> 8000: psychotherapist 0.0198 ... <6 dims>
embed[3, 4]
#> that [dim4]
#> 0.050293
embed["that", 4]
#> that [dim4]
#> 0.050293