索引匹配大型数组Google脚本花费的时间很长

索引匹配大型数组Google脚本花费的时间很长

本文介绍了索引匹配大型数组Google脚本花费的时间很长的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我在下面的功能中尝试刮取4个网站,然后将结果合并到电子表格中.有没有更快的方法来匹配不是INDEX/MATCH公式的大型数组.我想要的输出是(上面是一个例子)

I have the function below where I am trying to scrape 4 websites, and then combine the results into a spreadsheet. Is there a faster way to match over a large array that isn't the INDEX/MATCH formulas. My desired output would be (obv this is an example)

MLBID | FG_ID | PA | K | K%| wOBA

MLBID | FG_ID | PA | K | K% | wOBA

12345 | 12345 | 12 | 5 | 41.7%| .300

12345 | 12345 | 12 | 5 | 41.7% | .300

虽然我下面的代码有效,但要达到Google脚本的6分钟限制,所需的时间太长了.我正在尝试进行的匹配是使用〜4000行.我已经尽可能地注释了我的代码.

While the code I have below works, it takes wayyyy too long reaches the 6-minute limit of Google Script. The matching that I am trying to do is with ~4000 rows. I have commented my code as much as possible.

function minors_batting_stats() {

  //this is the spreadsheet where I have a list of all of the IDs -- MLB and FG
  var ids = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Player List");

  //this is the output sheet
  var mb18vR_sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("2018 minors bat vs R");

  //various URLs I am trying to scrape
  var mb18vR_PA_url = 'https://www.mlb.com/prospects/stats/search?level=11&level=12&level=13&level=14&level=15&level=16&pitcher_throws=R&batter_stands=&game_date_gt=&game_date_lt=&season=2017&home_away=&draft_year=&prospect=&player_type=batter&sort_by=results&sort_order=desc&group_by=name&min_pa=&min_pitches=#results'
  var mb18vR_SO_url = 'https://www.mlb.com/prospects/stats/search?pa_result=strikeout&level=11&level=12&level=13&level=14&level=15&level=16&pitcher_throws=R&batter_stands=&game_date_gt=&game_date_lt=&season=2017&home_away=&draft_year=&prospect=&player_type=batter&sort_by=results&sort_order=desc&group_by=name&min_pa=&min_pitches=#results'
  var mb18vR_wOBA_url = 'https://www.mlb.com/prospects/stats/search?level=11&level=12&level=13&level=14&level=15&level=16&pitcher_throws=R&batter_stands=&game_date_gt=&game_date_lt=&season=2017&home_away=&draft_year=&prospect=&player_type=batter&sort_by=woba&sort_order=desc&group_by=name&min_pa=&min_pitches=#results'

  //creating an array for each scrape
  var res = [];
  var res1 = [];
  var res2 = [];
  var res3 = [];

  //getting the MLB and FG ids from the spreadsheet
  var mlbids = ids.getRange(1, 11, ids.getLastRow()).getValues();
  var fgids = ids.getRange(1,9, ids.getLastRow()).getValues();

  //scraping SO against RHP
  var content_SO = UrlFetchApp.fetch(mb18vR_SO_url).getContentText();
  var e_SO = Parser.data(content_SO).from('tbody').to('</tbody>').build();
  var rows_SO = Parser.data(e_SO).from('<tr class="player_row"').to('</tr>').iterate();

  for (var i=0; i<rows_SO.length; i++) { //rows.length
    res1[i] = [];
    res1[i][0] = Parser.data(rows_SO[i]).from('/player/').to('/').build();

    var SOs = Parser.data(rows_SO[i]).from('<td align="left">').to('</td>').iterate();
    res1[i][1] = SOs[1];
  }

  //scraping wOBA against RHP
  var content_wOBA = UrlFetchApp.fetch(mb18vR_wOBA_url).getContentText();
  var e_wOBA = Parser.data(content_wOBA).from('tbody').to('</tbody>').build();
  var rows_wOBA = Parser.data(e_wOBA).from('<tr class="player_row"').to('</tr>').iterate();

  for (var i=0; i<rows_wOBA.length; i++) { //rows.length
    res2[i] = [];
    res2[i][0] = Parser.data(rows_wOBA[i]).from('/player/').to('/').build();

    var wOBAs = Parser.data(rows_wOBA[i]).from('<td align="left">').to('</td>').iterate();
    res2[i][1] = wOBAs[2];
  }

  //scraping PA against RHP
  var content = UrlFetchApp.fetch(mb18vR_PA_url).getContentText();
  var e = Parser.data(content).from('tbody').to('</tbody>').build();
  var rows = Parser.data(e).from('<tr class="player_row"').to('</tr>').iterate();

  for (var i=0; i<rows.length; i++) { //rows.length
    res[i] = [];
    res[i][0] = Parser.data(rows[i]).from('/player/').to('/').build();
    res[i][1] = [];

    //matching the MLB_ID with FG_ID
    var mlbID = res[i][0];
    for(var j = 0; j<mlbids.length;j++){
      if(mlbids[j] == mlbID){
        res[i][1] = fgids[j];
      }
    }

    var PAs = Parser.data(rows[i]).from('<td align="left">').to('</td>').iterate();
    res[i][2] = PAs[1];

    //matching the MLB_ID from PA (res) with SO (res1)
    res[i][3] = 0;
    for (var w=0; w<res1.length; w++) {
      if (res[i][0] == res1[w][0]) {
        res[i][3] = res1[w][1];
      }
    }

    //Calculating K%
    res[i][4] = res[i][3] / res[i][2]

    //matching the MLB_ID from PA (res) with wOBA (res1)
    res[i][5] = 0;
    for (var v=0; v<res2.length; v++) {
      if (res[i][0] == res2[v][0]) {
        res[i][5] = res2[v][1];
      }
    }
  }

  //pasting values
  mb18vR_sheet.getRange(2, 1, res.length, res[0].length).setValues(res);
}

推荐答案

您遇到的问题是,对于比较数据的每一行,您正在强制脚本多次遍历大型数据集.更好的方法是构建一个查找对象,该对象在所需的唯一标识符和您要访问的数据数组的行之间映射:

The issue you have is that you are forcing your script to loop through large datasets many many times for each row of compared data. A better approach is to build a lookup object, which maps between a desired unique identifier and the row of the data array you want to access:

/* Make an object from an Array[][] that has a unique identifier in one of the columns.
 * @param Array[][] data       The 2D array of data to index, e.g. [ [r1c1, r1c2, ...], [r2c1, r2c2, ...], ... ]
 * @param Integer   idColumn   The column in the data array that is a unique row identifier
                               e.g. the column index that contains the product's serial number, in a data
                               array that has only a single row per unique product.
   @return Object {}           An object that maps between an id and a row index, such that
                               `object[id]` = the row index for the specific row in data that has id = id
 */
function makeKey(data, idColumn) {
  if(!data || !data.length || !data[0].length)
    throw new ValueError("Input data argument is not Array[][]");
  // Assume the first column is the column with the unique identifier if not given by the caller.
  if(idColumn === undefined)
    idColumn = 0;

  var key = {};
  for(var r = 0, rows = data.length; r < rows; ++r) {
    var id = data[r][idColumn];
    if (key[id])
      throw new ValueError("ID is not unique for id='" + id + "'");

    key[id] = r;
  }
  return key;
}

用法:

var database = someSheet.getDataRange().getValues();
var lookup = makeKey(database, 3); // here we say that the 4th column has the unique values.

var newData = /* read a 2D array from somewhere */;
for(var r = 0, rows < newData.length; r < rows; ++r) {
  var id = newData[r][3];
  var existingIndex = lookup[id];
  if (existingIndex) {
    var oldDataRow = database[existingIndex];
  } else {
    // No existing data.
  }
}

通过为数据数组创建查找对象,您不再需要重新搜索它们并进行比较,因为您只搜索了一次并存储了关系,而不是每次都丢弃它.请注意,所创建的密钥基于数据的特定(且唯一)属性.没有这种关系,这种特殊的索引编制方法将行不通-但另一种方法将行得通.

By making a lookup object for your data arrays, you no longer have to re-search them and make comparisons, because you did the search once and stored the relationship, rather than discarding it every time. Note that the key that was made is based on a specific (and unique) property of the data. Without that relationship, this particular indexing approach won't work - but a different one will.

这篇关于索引匹配大型数组Google脚本花费的时间很长的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!

08-15 00:26
查看更多