賞金將在 6 天后到期。此問題的答案有資格獲得 50聲望賞金。 wake_wake正在尋找一個規范的答案。
假設我有一個data.table
觀察結果是我的消費者一起購買的產品的成對組合。
dt
對于我的 中的每對產品(在 中的一行),我想查找data.table
它們是否有第三種共同的產品,有時也與其中一種產品一起購買。
我想將“常見產品”作為新列包含在dt
.
目前,我這樣做如下。但我的真實資料包含數百萬行。從 1 周開始計算資料需要 20 小時。
我怎樣才能加快速度?一個apply
函式是智能的,還是我應該考慮映射?
模擬示例:
library(data.table)
library(stringi)
library(future.apply)
set.seed(1)
# build mock data
dt <- data.table(V1 = stri_rand_strings(100, 1),
V2 = stri_rand_strings(100, 1))
head(dt,17)
# V1 V2
#1: G e
#2: N L
#3: Z G
#4: u z
#5: C d
#6: t D
# 7: w 8
# 8: e T
# 9: d v
#10: 3 b
#11: C y
#12: A j
#13: g M
#14: N Q
#15: l 9
#16: U 0
#17: i i
#function to find common products
find_products <- function(a, b){
library(data.table)
toString(unique((dt[.(c(a, b)), on=.(V1), V2[duplicated(V2)]])))
}
#initiate parallel processing
plan(multisession) # on Windows machine - use plan(multicore) on Linux
#apply function across rows
common_products <- future_apply(dt, 1, function(y) find_products(y['V1'], y['V2']))
dt_final <- cbind(dt, common_products)
#head(dt, 17)
# V1 V2 common_products
# 1: G e
# 2: N L
# 3: Z G
# 4: u z
# 5: C d
# 6: t D
# 7: w 8
# 8: e T
# 9: d v
#10: 3 b
#11: C y
#12: A j
#13: g M
#14: N Q
#15: l 9
#16: U 0
#17: i i i, z, B, l
uj5u.com熱心網友回復:
可以將三向對視為無向圖中的三角形。該包igraph
可以有效地找到這些。我包含了一個包含 20M 對 3 字符產品代碼的示例。它在大約 70 秒內在單個執行緒上運行。
library(data.table)
library(stringi)
library(igraph)
getpaired <- function(g, id) names(unique(neighbors(g, id)))
commonProducts <- function(dt) {
blnSort <- dt$V1 > dt$V2
dt[blnSort, c("V2", "V1") := list(V1, V2)] # sort each row
# get triangles
g <- graph_from_data_frame(dt, FALSE)
m <- matrix(V(g)$name[triangles(g)], ncol = 3, byrow = TRUE)
# sort each row
m <- matrix(m[order(row(m), m, method = "radix")], ncol = 3, byrow = TRUE)
dt3 <- as.data.table(m)
# map common products back to the original dataframe
dt3 <- rbindlist(
list(
# the three ordered pairs in each triangle
dt3,
dt3[, c(1, 3, 2)],
dt3[, c(2, 3, 1)],
# common products in "two-sided" triangles
dt[V1 == V2][
, .(V2 = V2, V3 = .(getpaired(g, V1))), by = "V1"
][
, .(V1 = rep(rep.int(V1, lengths(V3)), 2),
V2 = c(rep.int(V1, lengths(V3)), unlist(V3)),
V3 = c(unlist(V3), rep.int(V1, lengths(V3))))
][ # sort (V1, V2) in each row
V1 > V2, c("V2", "V1") := list(V1, V2)
]
),
FALSE # bind by index
)[ # collapse common products into a single vector for each pair
, .(V3 = .(V3)),
by = c("V1", "V2")
][ # join into the original (row-sorted) data.table
dt, on = c("V1", "V2")
][ # unsort V1, V2 in each row to match the original (unsorted) data.table
, c("V1", "V2") := dt[blnSort, c("V2", "V1") := list(V1, V2)]
]
}
set.seed(1)
# build mock data
dt <- data.table(V1 = stri_rand_strings(100, 1),
V2 = stri_rand_strings(100, 1))
dt3 <- commonProducts(dt)
print(dt3)
#> V1 V2 V3
#> 1: G e
#> 2: N L
#> 3: Z G
#> 4: u z
#> 5: C d B
#> 6: t D t
#> 7: w 8
#> 8: e T
#> 9: d v
#> 10: 3 b
#> 11: C y
#> 12: A j
#> 13: g M
#> 14: N Q
#> 15: l 9
#> 16: U 0
#> 17: i i i,7,E,B,5,z,...
#> 18: z 6
#> 19: N R S
#> 20: m d
#> 21: v z
#> 22: D U
#> 23: e U
#> 24: 7 A
#> 25: G k
#> 26: N S R
#> 27: 0 V
#> 28: N C
#> 29: r E
#> 30: L a
#> 31: T Z
#> 32: b 4
#> 33: U 2
#> 34: B d C,6
#> 35: p v
#> 36: f b
#> 37: n Y
#> 38: 6 W j
#> 39: i z i
#> 40: P V
#> 41: o g
#> 42: e b
#> 43: m E
#> 44: Y G
#> 45: W j
#> 46: m S
#> 47: 1 A
#> 48: T k
#> 49: j 6 W
#> 50: g r
#> 51: T c
#> 52: r Y
#> 53: R K
#> 54: F S
#> 55: 4 V
#> 56: 6 B d
#> 57: J W
#> 58: W 4
#> 59: f H
#> 60: P D
#> 61: u H
#> 62: I t t
#> 63: S R N
#> 64: K m
#> 65: e s
#> 66: F P
#> 67: T 3
#> 68: l K
#> 69: 5 i i
#> 70: s K O
#> 71: L d
#> 72: q q P,q,q
#> 73: L r d
#> 74: K O s
#> 75: T N
#> 76: t t D,I,t,x,t
#> 77: r d L
#> 78: O j
#> 79: m b
#> 80: x t t
#> 81: Q I
#> 82: i B i
#> 83: O s K
#> 84: K V
#> 85: k s
#> 86: C B d
#> 87: i l i
#> 88: 7 i i
#> 89: F w
#> 90: 8 X
#> 91: E i i
#> 92: 3 O
#> 93: d 6 B
#> 94: s v
#> 95: m H
#> 96: n a
#> 97: S 6
#> 98: P q q
#> 99: o J
#> 100: b m
#> V1 V2 V3
# timing a much larger dataset
dt <- data.table(V1 = stri_rand_strings(2e7, 3),
V2 = stri_rand_strings(2e7, 3))
system.time(dt3 <- commonProducts(dt))
#> user system elapsed
#> 72.75 3.05 71.88
dt3[lengths(V3) != 0L] # show only those pairs with common products
#> V1 V2 V3
#> 1: GBW mDN lxF
#> 2: ix6 jpR 0VI
#> 3: xLG VeE aik
#> 4: A36 RzJ YYu
#> 5: zAo OYu zAo
#> ---
#> 1841567: qX9 xrW 7lb
#> 1841568: knO 2G6 knO
#> 1841569: rsU 5Rw ER8
#> 1841570: Bts 3L1 1bQ
#> 1841571: c0h pgd jxJ
V1
這處理==時創建的“雙面三角形” V2
(與 OP 示例資料中的第 17 行一樣)。例如,如果整個資料集由 和 對組成(t, i)
,(i, i)
那么i
將是(t, i)
(i
與兩者配對) 的共同乘積,并且t
將是(并且每個都與兩者配對)的共同乘積。i
i, t
(i, i)
i
t
i
i
uj5u.com熱心網友回復:
也許這會有所幫助
library(igraph)
g <- simplify(graph_from_data_frame(dt, directed = FALSE))
g <- simplify(graph_from_data_frame(dt, directed = FALSE))
dcast(
melt(dt[, id := 1:.N], "id")[
,
common := toString(names(V(g))[do.call(intersect, ego(g, nodes = value, mindist = 1))]),
id
],
id common ~ variable
)[, .(V1, V2, common)]
這使
V1 V2 common
1: G e
2: N L
3: Z G
4: u z
5: C d B
6: t D
7: w 8
8: e T
9: d v
10: 3 b
11: C y
12: A j
13: g M
14: N Q
15: l 9
16: U 0
17: i i l, z, 7, B, 5, E
18: z 6
19: N R S
20: m d
21: v z
22: D U
23: e U
24: 7 A
25: G k
26: N S R
27: 0 V
28: N C
29: r E
30: L a
31: T Z
32: b 4
33: U 2
34: B d C, 6
35: p v
36: f b
37: n Y
38: 6 W j
39: i z
40: P V
41: o g
42: e b
43: m E
44: Y G
45: W j 6
46: m S
47: 1 A
48: T k
49: j 6 W
50: g r
51: T c
52: r Y
53: R K
54: F S
55: 4 V
56: 6 B d
57: J W
58: W 4
59: f H
60: P D
61: u H
62: I t
63: S R N
64: K m
65: e s
66: F P
67: T 3
68: l K
69: 5 i
70: s K O
71: L d r
72: q q P
73: L r d
74: K O s
75: T N
76: t t D, I, x
77: r d L
78: O j
79: m b
80: x t
81: Q I
82: i B
83: O s K
84: K V
85: k s
86: C B d
87: i l
88: 7 i
89: F w
90: 8 X
91: E i
92: 3 O
93: d 6 B
94: s v
95: m H
96: n a
97: S 6
98: P q
99: o J
100: b m
V1 V2 common
uj5u.com熱心網友回復:
我相信您可以堅持使用 data.table 來完成您想做的事情。正如評論者已經指出的那樣,您需要確定產品配對 [a,b] 是否等同于 [b,a](在您的示例答案中,它僅適用于配對 [a,b])。無論如何,這個答案的瓶頸是Map()
電話;您可能可以從中獲得更快的速度,future_Map()
但您必須測驗您的實際資料以查看是否需要它。
我還想指出,我將常用產品列保留為串列列,盡管您可能希望它采用不同的格式。現在,當沒有匹配時,它是 NULL/空字符列的混合,因此如果您將其保留為串列列,您可能需要清理它 - 由您決定。
解決方案:
dt_unique = unique(dt[, .(V1, V2)])
dt_pairs = dt_unique[, list(ref_list = list(unique(V2))), .(product = V1)]
dt_unique = dt_pairs[dt_unique, on = c("product" = "V2")]
setnames(dt_unique, c("V2", "V2_ref", "V1"))
dt_unique = dt_pairs[dt_unique, on = c("product" = "V1")]
setnames(dt_unique, c("V1", "V1_ref", "V2", "V2_ref"))
dt_unique[, common_prods := Map(function(x, y) unique.default(y[chmatch(x, y, 0L)]), V1_ref, V2_ref)]
dt_unique[, c("V1_ref", "V2_ref") := NULL]
dt_unique[dt, on = c("V1", "V2")]
V1 V2 common_prods correct_common_prods
1: G e
2: N L
3: Z G
4: u z
5: C d
6: t D
7: w 8
8: e T
9: d v
10: 3 b
11: C y
12: A j
13: g M
14: N Q
15: l 9
16: U 0
17: i i i,z,B,l i, z, B, l
18: z 6
19: N R
20: m d
21: v z
22: D U
23: e U
24: 7 A
25: G k
26: N S R R
27: 0 V
28: N C
29: r E
30: L a
31: T Z
32: b 4
33: U 2
34: B d
35: p v
36: f b
37: n Y
38: 6 W
39: i z
40: P V
41: o g
42: e b
43: m E
44: Y G
45: W j
46: m S
47: 1 A
48: T k
49: j 6
50: g r
51: T c
52: r Y
53: R K
54: F S
55: 4 V
56: 6 B
57: J W
58: W 4
59: f H
60: P D
61: u H
62: I t t t
63: S R
64: K m
65: e s
66: F P
67: T 3
68: l K
69: 5 i i i
70: s K
71: L d
72: q q q q
73: L r d d
74: K O
75: T N
76: t t D,t D, t
77: r d
78: O j
79: m b
80: x t t t
81: Q I
82: i B
83: O s
84: K V
85: k s
86: C B d d
87: i l
88: 7 i i i
89: F w
90: 8 X
91: E i i i
92: 3 O
93: d 6
94: s v
95: m H
96: n a
97: S 6
98: P q q q
99: o J
100: b m
V1 V2 common_prods correct_common_prods
可重現的代碼(帶注釋):
library(data.table)
n = 1e2
set.seed(1)
dt <- data.table(V1 = stringi::stri_rand_strings(n, 1),
V2 = stringi::stri_rand_strings(n, 1))
#Matching your output:
find_products <- function(a, b){
library(data.table)
toString(unique((dt[.(c(a, b)), on=.(V1), V2[duplicated(V2)]])))
}
dt[, correct_common_prods := apply(dt, 1, function(y) find_products(y[['V1']], y[['V2']]))]
# If (a, b) and (b, a) are equivalent, you'll want this instead:
# dt_unique = unique(rbindlist(list(dt[, .(V1, V2)], dt[, .(V2, V1)]), use.names = FALSE))
dt_unique = unique(dt[, .(V1, V2)])
# Creating list-column w/ corresponding products
dt_pairs = dt_unique[, list(ref_list = list(unique(V2))), .(product = V1)]
# Merging and re-naming. There may be a more data.table way to
# handle the renaming because this feels not-eloquent
dt_unique = dt_pairs[dt_unique, on = c("product" = "V2")]
setnames(dt_unique, c("V2", "V2_ref", "V1"))
dt_unique = dt_pairs[dt_unique, on = c("product" = "V1")]
setnames(dt_unique, c("V1", "V1_ref", "V2", "V2_ref"))
# This is the memory-intensive part because it checks for the intersection on
# each row. This creates a list-column `common_prods`
# OR, easier to read but slower:
# dt_unique[, common_prods := Map(intersect, V1_ref, V2_ref)]
dt_unique[, common_prods := Map(function(x, y) unique.default(y[chmatch(x, y, 0L)]), V1_ref, V2_ref)]
# Column cleanup (retain _ref columns to better understand how this works)
# then merging in the common products
dt_unique[, c("V1_ref", "V2_ref") := NULL]
dt = dt_unique[dt, on = c("V1", "V2")]
轉載請註明出處,本文鏈接:https://www.uj5u.com/shujuku/457305.html