Question
I have the function below, where I am trying to scrape 4 websites and then combine the results into one spreadsheet. Is there a faster way to match over a large array than INDEX/MATCH-style lookups? My desired output would be (obviously this is just an example):
MLBID | FG_ID | PA | K | K% | wOBA
12345 | 12345 | 12 | 5 | 41.7% | .300
While the code I have below works, it takes far too long and hits Google Apps Script's 6-minute execution limit. The matching I am trying to do is over ~4000 rows. I have commented my code as much as possible.
function minors_batting_stats() {
  // This is the spreadsheet where I have a list of all of the IDs -- MLB and FG
  var ids = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Player List");
  // This is the output sheet
  var mb18vR_sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("2018 minors bat vs R");
  // Various URLs I am trying to scrape
  var mb18vR_PA_url = 'https://www.mlb.com/prospects/stats/search?level=11&level=12&level=13&level=14&level=15&level=16&pitcher_throws=R&batter_stands=&game_date_gt=&game_date_lt=&season=2017&home_away=&draft_year=&prospect=&player_type=batter&sort_by=results&sort_order=desc&group_by=name&min_pa=&min_pitches=#results';
  var mb18vR_SO_url = 'https://www.mlb.com/prospects/stats/search?pa_result=strikeout&level=11&level=12&level=13&level=14&level=15&level=16&pitcher_throws=R&batter_stands=&game_date_gt=&game_date_lt=&season=2017&home_away=&draft_year=&prospect=&player_type=batter&sort_by=results&sort_order=desc&group_by=name&min_pa=&min_pitches=#results';
  var mb18vR_wOBA_url = 'https://www.mlb.com/prospects/stats/search?level=11&level=12&level=13&level=14&level=15&level=16&pitcher_throws=R&batter_stands=&game_date_gt=&game_date_lt=&season=2017&home_away=&draft_year=&prospect=&player_type=batter&sort_by=woba&sort_order=desc&group_by=name&min_pa=&min_pitches=#results';
  // Creating an array for each scrape
  var res = [];
  var res1 = [];
  var res2 = [];
  var res3 = [];
  // Getting the MLB and FG ids from the spreadsheet
  var mlbids = ids.getRange(1, 11, ids.getLastRow()).getValues();
  var fgids = ids.getRange(1, 9, ids.getLastRow()).getValues();
  // Scraping SO against RHP
  var content_SO = UrlFetchApp.fetch(mb18vR_SO_url).getContentText();
  var e_SO = Parser.data(content_SO).from('tbody').to('</tbody>').build();
  var rows_SO = Parser.data(e_SO).from('<tr class="player_row"').to('</tr>').iterate();
  for (var i = 0; i < rows_SO.length; i++) {
    res1[i] = [];
    res1[i][0] = Parser.data(rows_SO[i]).from('/player/').to('/').build();
    var SOs = Parser.data(rows_SO[i]).from('<td align="left">').to('</td>').iterate();
    res1[i][1] = SOs[1];
  }
  // Scraping wOBA against RHP
  var content_wOBA = UrlFetchApp.fetch(mb18vR_wOBA_url).getContentText();
  var e_wOBA = Parser.data(content_wOBA).from('tbody').to('</tbody>').build();
  var rows_wOBA = Parser.data(e_wOBA).from('<tr class="player_row"').to('</tr>').iterate();
  for (var i = 0; i < rows_wOBA.length; i++) {
    res2[i] = [];
    res2[i][0] = Parser.data(rows_wOBA[i]).from('/player/').to('/').build();
    var wOBAs = Parser.data(rows_wOBA[i]).from('<td align="left">').to('</td>').iterate();
    res2[i][1] = wOBAs[2];
  }
  // Scraping PA against RHP
  var content = UrlFetchApp.fetch(mb18vR_PA_url).getContentText();
  var e = Parser.data(content).from('tbody').to('</tbody>').build();
  var rows = Parser.data(e).from('<tr class="player_row"').to('</tr>').iterate();
  for (var i = 0; i < rows.length; i++) {
    res[i] = [];
    res[i][0] = Parser.data(rows[i]).from('/player/').to('/').build();
    res[i][1] = '';
    // Matching the MLB_ID with FG_ID
    var mlbID = res[i][0];
    for (var j = 0; j < mlbids.length; j++) {
      if (mlbids[j][0] == mlbID) {
        res[i][1] = fgids[j][0];
      }
    }
    var PAs = Parser.data(rows[i]).from('<td align="left">').to('</td>').iterate();
    res[i][2] = PAs[1];
    // Matching the MLB_ID from PA (res) with SO (res1)
    res[i][3] = 0;
    for (var w = 0; w < res1.length; w++) {
      if (res[i][0] == res1[w][0]) {
        res[i][3] = res1[w][1];
      }
    }
    // Calculating K%
    res[i][4] = res[i][3] / res[i][2];
    // Matching the MLB_ID from PA (res) with wOBA (res2)
    res[i][5] = 0;
    for (var v = 0; v < res2.length; v++) {
      if (res[i][0] == res2[v][0]) {
        res[i][5] = res2[v][1];
      }
    }
  }
  // Pasting values
  mb18vR_sheet.getRange(2, 1, res.length, res[0].length).setValues(res);
}
Answer
The issue is that you are forcing your script to loop through the large datasets again for every row of compared data, so the work grows as rows × lookups rather than as a single pass. A better approach is to build a lookup object, which maps between a desired unique identifier and the row of the data array you want to access:
/**
 * Make an object from an Array[][] that has a unique identifier in one of the columns.
 * @param {Array[][]} data The 2D array of data to index, e.g. [ [r1c1, r1c2, ...], [r2c1, r2c2, ...], ... ]
 * @param {Integer} idColumn The column in the data array that is a unique row identifier,
 *     e.g. the column index that contains the product's serial number, in a data
 *     array that has only a single row per unique product.
 * @return {Object} An object that maps between an id and a row index, such that
 *     object[id] = the row index for the specific row in data that has id = id.
 */
function makeKey(data, idColumn) {
  if (!data || !data.length || !data[0].length)
    throw new TypeError("Input data argument is not Array[][]");
  // Assume the first column holds the unique identifier if not given by the caller.
  if (idColumn === undefined)
    idColumn = 0;
  var key = {};
  for (var r = 0, rows = data.length; r < rows; ++r) {
    var id = data[r][idColumn];
    // Explicit check, since a valid row index of 0 is falsy.
    if (key[id] !== undefined)
      throw new Error("ID is not unique for id='" + id + "'");
    key[id] = r;
  }
  return key;
}
Usage:
var database = someSheet.getDataRange().getValues();
var lookup = makeKey(database, 3); // here we say that the 4th column has the unique values.
var newData = /* read a 2D array from somewhere */;
for (var r = 0, rows = newData.length; r < rows; ++r) {
  var id = newData[r][3];
  var existingIndex = lookup[id];
  if (existingIndex !== undefined) { // explicit check: row index 0 is falsy but valid
    var oldDataRow = database[existingIndex];
  } else {
    // No existing data.
  }
}
By making a lookup object for your data arrays, you no longer have to re-search them and make comparisons, because you did the search once and stored the relationship, rather than discarding it every time. Note that the key that was made is based on a specific (and unique) property of the data. Without that relationship, this particular indexing approach won't work - but a different one will.
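Applied to the script in the question, the three inner matching loops collapse into constant-time lookups. The following is a minimal sketch, not a drop-in replacement: it assumes the makeKey function above, that res, res1, res2, mlbids, and fgids are filled exactly as in the original code, and that every ID column is unique and non-blank (makeKey throws otherwise).

// Build each lookup once, before the main loop over the PA rows.
var soKey = makeKey(res1, 0);    // MLB ID -> row index in the strikeout array
var wobaKey = makeKey(res2, 0);  // MLB ID -> row index in the wOBA array
var idKey = makeKey(mlbids, 0);  // MLB ID -> row index in the player list

for (var i = 0; i < res.length; i++) {
  var mlbID = res[i][0]; // parsed from the PA page, as in the original loop
  // Each former inner loop is now a single object-property lookup.
  var idRow = idKey[mlbID];
  res[i][1] = (idRow !== undefined) ? fgids[idRow][0] : '';
  var soRow = soKey[mlbID];
  res[i][3] = (soRow !== undefined) ? res1[soRow][1] : 0;
  res[i][4] = res[i][3] / res[i][2]; // K% calculation is unchanged
  var wobaRow = wobaKey[mlbID];
  res[i][5] = (wobaRow !== undefined) ? res2[wobaRow][1] : 0;
}

With ~4000 rows, this replaces millions of nested comparisons with a few thousand object lookups, which should finish well inside the 6-minute quota.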