Parsing Stack Overflow's posts.xml on Hadoop


This article describes how to parse Stack Overflow's posts.xml on Hadoop; it may be a useful reference for anyone facing the same problem.

Problem description

I am following this article by Anoop Madhusudanan on CodeProject to build a recommendation engine, not on a cluster but on my own system.

The problem arises when I try to parse posts.xml, whose structure is as follows:

 <row Id="99" PostTypeId="2" ParentId="88" CreationDate="2008-08-01T14:55:08.477" Score="2" Body="&lt;blockquote&gt;&#xD;&#xA;  &lt;p&gt;The actual resolution of gettimeofday() depends on the hardware architecture. Intel processors as well as SPARC machines offer high resolution timers that measure microseconds. Other hardware architectures fall back to the system’s timer, which is typically set to 100 Hz. In such cases, the time resolution will be less accurate. &lt;/p&gt;&#xD;&#xA;&lt;/blockquote&gt;&#xD;&#xA;&#xD;&#xA;&lt;p&gt;I obtained this answer from &lt;a href=&quot;http://www.informit.com/guides/content.aspx?g=cplusplus&amp;amp;seqNum=272&quot; rel=&quot;nofollow&quot;&gt;High Resolution Time Measurement and Timers, Part I&lt;/a&gt;&lt;/p&gt;" OwnerUserId="25" LastActivityDate="2008-08-01T14:55:08.477" />

Now I need to parse this file (1.4 GB in size) on Hadoop, for which I have written code in Java and created its jar. The Java class is as follows:

import java.io.IOException;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.w3c.dom.Node;
import org.w3c.dom.Element;

import java.io.File;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;


public class Recommend {

    static class Map extends Mapper<Text, Text, Text, Text> {
        Path path;
        String fXmlFile;
        DocumentBuilderFactory dbFactory;
        DocumentBuilder dBuilder;
        Document doc;

        /**
         * Given an output filename, write a bunch of random records to it.
         */
        public void map(LongWritable key, Text value,
                OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
            try{
                fXmlFile=value.toString();
                dbFactory = DocumentBuilderFactory.newInstance();
                dBuilder= dbFactory.newDocumentBuilder();
                doc= dBuilder.parse(fXmlFile);

                doc.getDocumentElement().normalize();
                NodeList nList = doc.getElementsByTagName("row");

                for (int temp = 0; temp < nList.getLength(); temp++) {

                    Node nNode = nList.item(temp);
                    Element eElement = (Element) nNode;

                    Text keyWords =new Text(eElement.getAttribute("OwnerUserId"));
                    Text valueWords = new Text(eElement.getAttribute("ParentId"));
                    String val=keyWords.toString()+" "+valueWords.toString();
                    // Write the sentence
                    if(keyWords != null && valueWords != null){
                        output.collect(keyWords, new Text(val));
                    }
                }

            }catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     *
     * @throws IOException
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        //String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        /*if (args.length != 2) {
          System.err.println("Usage: wordcount <in> <out>");
          System.exit(2);
        }*/
//      FileSystem fs = FileSystem.get(conf);
        Job job = new Job(conf, "Recommend");
        job.setJarByClass(Recommend.class);

        // the keys are words (strings)
        job.setOutputKeyClass(Text.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);

        // the values are counts (ints)
        job.setOutputValueClass(Text.class);

        job.setMapperClass(Map.class);
        //conf.setReducerClass(Reduce.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
        Path outPath = new Path(args[1]);
        FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
        if (dfs.exists(outPath)) {
            dfs.delete(outPath, true);
        }
    }
}

I expect the output to be a file in Hadoop containing the output as OwnerUserId ParentId, but instead I get output like:

1599788   <row Id="2292" PostTypeId="2" ParentId="2284" CreationDate="2008-08-05T13:28:06.700" Score="0" ViewCount="0" Body="&lt;p&gt;The first thing you should do is contact the main people who run the open source project. Ask them if it is ok to contribute to the code and go from there.&lt;/p&gt;&#xD;&#xA;&#xD;&#xA;&lt;p&gt;Simply writing your improved code and then giving it to them may result in your code being rejected.&lt;/p&gt;" OwnerUserId="383" LastActivityDate="2008-08-05T13:28:06.700" />

I don't know where the 1599788 appearing as the key from the mapper comes from.

I don't know much about writing mapper classes for Hadoop; I need help modifying my code to get the desired output.

Thanks in advance.

Solution

After a lot of research and experimentation, I finally learned how to write a mapper that parses XML files with the syntax shown above. I changed my approach, and this is my new mapper code; it works for my use case. (For context, the 1599788 in the earlier output is the byte offset that the default TextInputFormat passes to the mapper as the LongWritable key: the original map method used the old OutputCollector/Reporter signature, so it never overrode the new-API Mapper.map and Hadoop ran the default identity mapper, echoing each offset/line pair.)

Hope it helps someone and saves them some time :)

import java.io.IOException;
import java.util.StringTokenizer;

import javax.xml.parsers.ParserConfigurationException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.xml.sax.SAXException;

public class Map extends Mapper<LongWritable, Text, NullWritable, Text> {
    // Singleton NullWritable used as the output key (writing a raw null key would fail to serialize).
    NullWritable obj = NullWritable.get();

    @Override
    public void map(LongWritable key, Text value, Context context) throws InterruptedException {
        StringTokenizer tok= new StringTokenizer(value.toString());
        String pa=null,ow=null,pi=null,v;
        while (tok.hasMoreTokens()) {
            String[] arr;
            String val = (String) tok.nextToken();
            if(val.contains("PostTypeId")){
                arr= val.split("[\"]");
                pi=arr[arr.length-1];
                if(pi.equals("2")){
                    continue;
                }
                else break;
            }
            if(val.contains("ParentId")){
                arr= val.split("[\"]");
                pa=arr[arr.length-1];
            }
            else if(val.contains("OwnerUserId") ){
                arr= val.split("[\"]");
                ow=arr[arr.length-1];
                try {
                    if(pa!=null && ow != null){
                        // String.format uses %s placeholders; "{0},{1}" is MessageFormat syntax
                        // and would be written out literally instead of the actual values.
                        v = String.format("%s,%s", ow, pa);
                        context.write(obj,new Text(v));

                    }
                } catch (IOException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }
        }


    }

}
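
The answer above only shows the mapper. To actually run it, the driver from the question has to be adjusted to match the new mapper's (NullWritable, Text) output and to run as a map-only job. Here is a minimal driver sketch under those assumptions (the RecommendDriver class name and the argument handling are illustrative, not from the original post):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class RecommendDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "Recommend");   // same constructor style as the question's driver
        job.setJarByClass(RecommendDriver.class);

        job.setMapperClass(Map.class);          // the Map class from the answer above
        job.setNumReduceTasks(0);               // map-only job: mapper output goes straight to the output files

        // Output types must match what the mapper emits: (NullWritable, Text).
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        // The default TextInputFormat feeds each line of posts.xml to the mapper with its byte offset as the key.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));  // output directory must not already exist

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Packaged into a jar, this could be run with something like hadoop jar recommend.jar RecommendDriver <posts.xml path> <output path> (paths illustrative); each answer row (PostTypeId="2") should then yield one "OwnerUserId,ParentId" line in the output part files.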

This concludes this article on parsing Stack Overflow's posts.xml on Hadoop. I hope the answer presented here is helpful.
