R-使用apply加速搜索-有解無憂

賞金將在 6 天后到期。此問題的答案有資格獲得 50聲望賞金。 wake_wake正在尋找一個規范的答案。

假設我有一個data.table觀察結果是我的消費者一起購買的產品的成對組合。

dt對于我的中的每對產品（在中的一行），我想查找data.table它們是否有第三種共同的產品，有時也與其中一種產品一起購買。

我想將“常見產品”作為新列包含在dt.

目前，我這樣做如下。但我的真實資料包含數百萬行。從 1 周開始計算資料需要 20 小時。

我怎樣才能加快速度？一個apply函式是智能的，還是我應該考慮映射？

模擬示例：

library(data.table)
library(stringi)
library(future.apply)

set.seed(1)

# build mock data
dt <- data.table(V1 = stri_rand_strings(100, 1),
                 V2 = stri_rand_strings(100, 1))

head(dt,17)
#   V1 V2
#1:  G  e
#2:  N  L
#3:  Z  G
#4:  u  z
#5:  C  d
#6:  t  D
# 7:  w  8
# 8:  e  T
# 9:  d  v
#10:  3  b
#11:  C  y
#12:  A  j
#13:  g  M
#14:  N  Q
#15:  l  9
#16:  U  0
#17:  i  i

#function to find common products
find_products <- function(a, b){
  library(data.table)
  toString(unique((dt[.(c(a, b)), on=.(V1), V2[duplicated(V2)]])))
}

#initiate parallel processing
plan(multisession) # on Windows machine - use plan(multicore) on Linux

#apply function across rows
common_products <- future_apply(dt, 1, function(y) find_products(y['V1'], y['V2']))

dt_final <- cbind(dt, common_products)

#head(dt, 17)
#    V1 V2 common_products
# 1:  G  e                
# 2:  N  L                
# 3:  Z  G                
# 4:  u  z                
# 5:  C  d                
# 6:  t  D                
# 7:  w  8                
# 8:  e  T                
# 9:  d  v                
#10:  3  b                
#11:  C  y                
#12:  A  j                
#13:  g  M                
#14:  N  Q                
#15:  l  9                
#16:  U  0                
#17:  i  i      i, z, B, l

uj5u.com熱心網友回復：

可以將三向對視為無向圖中的三角形。該包igraph可以有效地找到這些。我包含了一個包含 20M 對 3 字符產品代碼的示例。它在大約 70 秒內在單個執行緒上運行。

library(data.table)
library(stringi)
library(igraph)

getpaired <- function(g, id) names(unique(neighbors(g, id)))
commonProducts <- function(dt) {
  blnSort <- dt$V1 > dt$V2
  dt[blnSort, c("V2", "V1") := list(V1, V2)] # sort each row
  # get triangles
  g <- graph_from_data_frame(dt, FALSE)
  m <- matrix(V(g)$name[triangles(g)], ncol = 3, byrow = TRUE)
  # sort each row
  m <- matrix(m[order(row(m), m, method = "radix")], ncol = 3, byrow = TRUE)
  dt3 <- as.data.table(m)
  # map common products back to the original dataframe
  dt3 <- rbindlist(
    list(
      # the three ordered pairs in each triangle
      dt3,
      dt3[, c(1, 3, 2)],
      dt3[, c(2, 3, 1)],
      # common products in "two-sided" triangles
      dt[V1 == V2][
        , .(V2 = V2, V3 = .(getpaired(g, V1))), by = "V1"
      ][
        , .(V1 = rep(rep.int(V1, lengths(V3)), 2),
            V2 = c(rep.int(V1, lengths(V3)), unlist(V3)),
            V3 = c(unlist(V3), rep.int(V1, lengths(V3))))
      ][ # sort (V1, V2) in each row
        V1 > V2, c("V2", "V1") := list(V1, V2)
      ]
    ),
    FALSE # bind by index
  )[ # collapse common products into a single vector for each pair
    , .(V3 = .(V3)),
    by = c("V1", "V2")
  ][ # join into the original (row-sorted) data.table
    dt, on = c("V1", "V2")
  ][ # unsort V1, V2 in each row to match the original (unsorted) data.table
    , c("V1", "V2") := dt[blnSort, c("V2", "V1") := list(V1, V2)]
  ]
}

set.seed(1)

# build mock data
dt <- data.table(V1 = stri_rand_strings(100, 1),
                 V2 = stri_rand_strings(100, 1))

dt3 <- commonProducts(dt)
print(dt3)
#>      V1 V2              V3
#>   1:  G  e                
#>   2:  N  L                
#>   3:  Z  G                
#>   4:  u  z                
#>   5:  C  d               B
#>   6:  t  D               t
#>   7:  w  8                
#>   8:  e  T                
#>   9:  d  v                
#>  10:  3  b                
#>  11:  C  y                
#>  12:  A  j                
#>  13:  g  M                
#>  14:  N  Q                
#>  15:  l  9                
#>  16:  U  0                
#>  17:  i  i i,7,E,B,5,z,...
#>  18:  z  6                
#>  19:  N  R               S
#>  20:  m  d                
#>  21:  v  z                
#>  22:  D  U                
#>  23:  e  U                
#>  24:  7  A                
#>  25:  G  k                
#>  26:  N  S               R
#>  27:  0  V                
#>  28:  N  C                
#>  29:  r  E                
#>  30:  L  a                
#>  31:  T  Z                
#>  32:  b  4                
#>  33:  U  2                
#>  34:  B  d             C,6
#>  35:  p  v                
#>  36:  f  b                
#>  37:  n  Y                
#>  38:  6  W               j
#>  39:  i  z               i
#>  40:  P  V                
#>  41:  o  g                
#>  42:  e  b                
#>  43:  m  E                
#>  44:  Y  G                
#>  45:  W  j                
#>  46:  m  S                
#>  47:  1  A                
#>  48:  T  k                
#>  49:  j  6               W
#>  50:  g  r                
#>  51:  T  c                
#>  52:  r  Y                
#>  53:  R  K                
#>  54:  F  S                
#>  55:  4  V                
#>  56:  6  B               d
#>  57:  J  W                
#>  58:  W  4                
#>  59:  f  H                
#>  60:  P  D                
#>  61:  u  H                
#>  62:  I  t               t
#>  63:  S  R               N
#>  64:  K  m                
#>  65:  e  s                
#>  66:  F  P                
#>  67:  T  3                
#>  68:  l  K                
#>  69:  5  i               i
#>  70:  s  K               O
#>  71:  L  d                
#>  72:  q  q           P,q,q
#>  73:  L  r               d
#>  74:  K  O               s
#>  75:  T  N                
#>  76:  t  t       D,I,t,x,t
#>  77:  r  d               L
#>  78:  O  j                
#>  79:  m  b                
#>  80:  x  t               t
#>  81:  Q  I                
#>  82:  i  B               i
#>  83:  O  s               K
#>  84:  K  V                
#>  85:  k  s                
#>  86:  C  B               d
#>  87:  i  l               i
#>  88:  7  i               i
#>  89:  F  w                
#>  90:  8  X                
#>  91:  E  i               i
#>  92:  3  O                
#>  93:  d  6               B
#>  94:  s  v                
#>  95:  m  H                
#>  96:  n  a                
#>  97:  S  6                
#>  98:  P  q               q
#>  99:  o  J                
#> 100:  b  m                
#>      V1 V2              V3

# timing a much larger dataset
dt <- data.table(V1 = stri_rand_strings(2e7, 3),
                 V2 = stri_rand_strings(2e7, 3))

system.time(dt3 <- commonProducts(dt))
#>    user  system elapsed 
#>   72.75    3.05   71.88
dt3[lengths(V3) != 0L] # show only those pairs with common products
#>           V1  V2  V3
#>       1: GBW mDN lxF
#>       2: ix6 jpR 0VI
#>       3: xLG VeE aik
#>       4: A36 RzJ YYu
#>       5: zAo OYu zAo
#>      ---            
#> 1841567: qX9 xrW 7lb
#> 1841568: knO 2G6 knO
#> 1841569: rsU 5Rw ER8
#> 1841570: Bts 3L1 1bQ
#> 1841571: c0h pgd jxJ

V1這處理==時創建的“雙面三角形” V2（與 OP 示例資料中的第 17 行一樣）。例如，如果整個資料集由和對組成(t, i)，(i, i)那么i將是(t, i)(i與兩者配對) 的共同乘積，并且t將是(并且每個都與兩者配對)的共同乘積。ii, t(i, i)itii

uj5u.com熱心網友回復：

也許這會有所幫助

library(igraph)

g <- simplify(graph_from_data_frame(dt, directed = FALSE))
g <- simplify(graph_from_data_frame(dt, directed = FALSE))
dcast(
    melt(dt[, id := 1:.N], "id")[
        ,
        common := toString(names(V(g))[do.call(intersect, ego(g, nodes = value, mindist = 1))]),
        id
    ],
    id   common ~ variable
)[, .(V1, V2, common)]

這使

     V1 V2           common
  1:  G  e
  2:  N  L
  3:  Z  G
  4:  u  z
  5:  C  d                B
  6:  t  D
  7:  w  8
  8:  e  T
  9:  d  v
 10:  3  b
 11:  C  y
 12:  A  j
 13:  g  M
 14:  N  Q
 15:  l  9
 16:  U  0
 17:  i  i l, z, 7, B, 5, E
 18:  z  6
 19:  N  R                S
 20:  m  d
 21:  v  z
 22:  D  U
 23:  e  U
 24:  7  A
 25:  G  k
 26:  N  S                R
 27:  0  V
 28:  N  C
 29:  r  E
 30:  L  a
 31:  T  Z
 32:  b  4
 33:  U  2
 34:  B  d             C, 6
 35:  p  v
 36:  f  b
 37:  n  Y
 38:  6  W                j
 39:  i  z
 40:  P  V
 41:  o  g
 42:  e  b
 43:  m  E
 44:  Y  G
 45:  W  j                6
 46:  m  S
 47:  1  A
 48:  T  k
 49:  j  6                W
 50:  g  r
 51:  T  c
 52:  r  Y
 53:  R  K
 54:  F  S
 55:  4  V
 56:  6  B                d
 57:  J  W
 58:  W  4
 59:  f  H
 60:  P  D
 61:  u  H
 62:  I  t
 63:  S  R                N
 64:  K  m
 65:  e  s
 66:  F  P
 67:  T  3
 68:  l  K
 69:  5  i
 70:  s  K                O
 71:  L  d                r
 72:  q  q                P
 73:  L  r                d
 74:  K  O                s
 75:  T  N
 76:  t  t          D, I, x
 77:  r  d                L
 78:  O  j
 79:  m  b
 80:  x  t
 81:  Q  I
 82:  i  B
 83:  O  s                K
 84:  K  V
 85:  k  s
 86:  C  B                d
 87:  i  l
 88:  7  i
 89:  F  w
 90:  8  X
 91:  E  i
 92:  3  O
 93:  d  6                B
 94:  s  v
 95:  m  H
 96:  n  a
 97:  S  6
 98:  P  q
 99:  o  J
100:  b  m
     V1 V2           common

uj5u.com熱心網友回復：

我相信您可以堅持使用 data.table 來完成您想做的事情。正如評論者已經指出的那樣，您需要確定產品配對 [a,b] 是否等同于 [b,a]（在您的示例答案中，它僅適用于配對 [a,b]）。無論如何，這個答案的瓶頸是Map()電話；您可能可以從中獲得更快的速度，future_Map()但您必須測驗您的實際資料以查看是否需要它。

我還想指出，我將常用產品列保留為串列列，盡管您可能希望它采用不同的格式。現在，當沒有匹配時，它是 NULL/空字符列的混合，因此如果您將其保留為串列列，您可能需要清理它 - 由您決定。

解決方案：

dt_unique = unique(dt[, .(V1, V2)])
dt_pairs = dt_unique[, list(ref_list = list(unique(V2))), .(product = V1)]

dt_unique = dt_pairs[dt_unique, on = c("product" = "V2")]
setnames(dt_unique, c("V2", "V2_ref", "V1"))
dt_unique = dt_pairs[dt_unique, on = c("product" = "V1")]
setnames(dt_unique, c("V1", "V1_ref", "V2", "V2_ref"))

dt_unique[, common_prods := Map(function(x, y) unique.default(y[chmatch(x, y, 0L)]), V1_ref, V2_ref)]

dt_unique[, c("V1_ref", "V2_ref") := NULL]
dt_unique[dt, on = c("V1", "V2")]

     V1 V2 common_prods correct_common_prods
  1:  G  e                                  
  2:  N  L                                  
  3:  Z  G                                  
  4:  u  z                                  
  5:  C  d                                  
  6:  t  D                                  
  7:  w  8                                  
  8:  e  T                                  
  9:  d  v                                  
 10:  3  b                                  
 11:  C  y                                  
 12:  A  j                                  
 13:  g  M                                  
 14:  N  Q                                  
 15:  l  9                                  
 16:  U  0                                  
 17:  i  i      i,z,B,l           i, z, B, l
 18:  z  6                                  
 19:  N  R                                  
 20:  m  d                                  
 21:  v  z                                  
 22:  D  U                                  
 23:  e  U                                  
 24:  7  A                                  
 25:  G  k                                  
 26:  N  S            R                    R
 27:  0  V                                  
 28:  N  C                                  
 29:  r  E                                  
 30:  L  a                                  
 31:  T  Z                                  
 32:  b  4                                  
 33:  U  2                                  
 34:  B  d                                  
 35:  p  v                                  
 36:  f  b                                  
 37:  n  Y                                  
 38:  6  W                                  
 39:  i  z                                  
 40:  P  V                                  
 41:  o  g                                  
 42:  e  b                                  
 43:  m  E                                  
 44:  Y  G                                  
 45:  W  j                                  
 46:  m  S                                  
 47:  1  A                                  
 48:  T  k                                  
 49:  j  6                                  
 50:  g  r                                  
 51:  T  c                                  
 52:  r  Y                                  
 53:  R  K                                  
 54:  F  S                                  
 55:  4  V                                  
 56:  6  B                                  
 57:  J  W                                  
 58:  W  4                                  
 59:  f  H                                  
 60:  P  D                                  
 61:  u  H                                  
 62:  I  t            t                    t
 63:  S  R                                  
 64:  K  m                                  
 65:  e  s                                  
 66:  F  P                                  
 67:  T  3                                  
 68:  l  K                                  
 69:  5  i            i                    i
 70:  s  K                                  
 71:  L  d                                  
 72:  q  q            q                    q
 73:  L  r            d                    d
 74:  K  O                                  
 75:  T  N                                  
 76:  t  t          D,t                 D, t
 77:  r  d                                  
 78:  O  j                                  
 79:  m  b                                  
 80:  x  t            t                    t
 81:  Q  I                                  
 82:  i  B                                  
 83:  O  s                                  
 84:  K  V                                  
 85:  k  s                                  
 86:  C  B            d                    d
 87:  i  l                                  
 88:  7  i            i                    i
 89:  F  w                                  
 90:  8  X                                  
 91:  E  i            i                    i
 92:  3  O                                  
 93:  d  6                                  
 94:  s  v                                  
 95:  m  H                                  
 96:  n  a                                  
 97:  S  6                                  
 98:  P  q            q                    q
 99:  o  J                                  
100:  b  m                                  
     V1 V2 common_prods correct_common_prods

可重現的代碼（帶注釋）：

library(data.table)

n = 1e2
set.seed(1)

dt <- data.table(V1 = stringi::stri_rand_strings(n, 1),
                 V2 = stringi::stri_rand_strings(n, 1))

#Matching your output:
find_products <- function(a, b){
  library(data.table)
  toString(unique((dt[.(c(a, b)), on=.(V1), V2[duplicated(V2)]])))
}
dt[, correct_common_prods := apply(dt, 1, function(y) find_products(y[['V1']], y[['V2']]))]


# If (a, b) and (b, a) are equivalent, you'll want this instead:
# dt_unique = unique(rbindlist(list(dt[, .(V1, V2)], dt[, .(V2, V1)]), use.names = FALSE))
dt_unique = unique(dt[, .(V1, V2)])

# Creating list-column w/ corresponding products
dt_pairs = dt_unique[, list(ref_list = list(unique(V2))), .(product = V1)]

# Merging and re-naming. There may be a more data.table way to 
# handle the renaming because this feels not-eloquent
dt_unique = dt_pairs[dt_unique, on = c("product" = "V2")]
setnames(dt_unique, c("V2", "V2_ref", "V1"))
dt_unique = dt_pairs[dt_unique, on = c("product" = "V1")]
setnames(dt_unique, c("V1", "V1_ref", "V2", "V2_ref"))

# This is the memory-intensive part because it checks for the intersection on
# each row. This creates a list-column `common_prods`
# OR, easier to read but slower: 
# dt_unique[, common_prods := Map(intersect, V1_ref, V2_ref)]
dt_unique[, common_prods := Map(function(x, y) unique.default(y[chmatch(x, y, 0L)]), V1_ref, V2_ref)]

# Column cleanup (retain _ref columns to better understand how this works)
# then merging in the common products
dt_unique[, c("V1_ref", "V2_ref") := NULL]
dt = dt_unique[dt, on = c("V1", "V2")]

轉載請註明出處，本文鏈接：https://www.uj5u.com/shujuku/457305.html

標籤：r 表现数据表查询优化申请

上一篇：加速處理32位組合數（kfromn）

下一篇：jetpackcompose中的字母滾動條