ip.scala
package ip import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext} object ip {
// Streaming entry point: reads topic "ip01" from Kafka with the direct-stream API, counts the
// records of each batch per (prive, city, fangshi) key, and hands the counts together with the
// batch's Kafka offset ranges to DateMyRedis.saveDataOffset.
// Set the "org" logger to WARN.
Logger.getLogger("org").setLevel(Level.WARN)
/**
 * Builds the streaming context, subscribes to Kafka and starts the job; blocks in
 * awaitTermination().
 *
 * NOTE(review): several numeric literals are absent from this method as it stands
 * (Seconds(), the three splits() look-ups and the second element of the emitted tuple).
 * They look like they were lost when the text was copied; the method cannot compile until
 * they are restored from the original source.
 *
 * @param args command-line arguments (not read)
 */
def main(args: Array[String]): Unit = {
//Build the Spark configuration; the master is hard-coded to local[*]
val conf = new SparkConf ()
.setAppName ( this.getClass.getSimpleName )
.setMaster ( "local[*]" )
// NOTE(review): Seconds() is called without an argument — the batch-interval value is missing.
val ssc=new StreamingContext(conf,Seconds()) //topic to consume
val topic="ip01"
val topics=Array(topic)
//Consumer group id; also passed to DateMyRedis.saveDataOffset as the offset hash name
val groupid="IPoffsets"
//Kafka connection parameters
val params=Map(
"bootstrap.servers" -> "hadoop01:9092,hadoop02:9092,hadoop03:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> groupid,
//Where to start consuming
"auto.offset.reset" -> "earliest",
//Whether offsets are auto-committed (disabled here)
"enable.auto.commit" -> (false: java.lang.Boolean)
)
//Create the Kafka direct stream //check whether stored offsets exist
// NOTE(review): no stored offsets are looked up or passed to Subscribe in this method
// (DateMyRedis.getOffset is not called here) — confirm whether resuming from the offsets
// saved in Redis was intended.
val stream: InputDStream[ConsumerRecord[String, String]] =
KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String,String](topics,params)
) stream.foreachRDD(rdd=>{
// Print every consumed record.
rdd.foreach(println(_))
//Capture the Kafka offset ranges of this batch
val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
//Extract the fields: each record value is split on the literal '|' character
val ip1:RDD[((String,String,String),Int)] = rdd.map ( tp => {
val splits = tp.value().split ( "[|]" )
// NOTE(review): the three field indices below are missing; which columns hold
// prive / city / fangshi cannot be determined from this file.
val prive=splits()
val city = splits ( )
val fangshi=splits()
// NOTE(review): the per-record count (second tuple element) is missing; the result is
// summed per key by reduceByKey below.
((prive,city,fangshi),)
} ).reduceByKey(_+_) //hand the counts and the offset information to the storage method
DateMyRedis.saveDataOffset(ip1,ranges,groupid) })
ssc.start()
ssc.awaitTermination() }
}
DataMyRedis.scala
package ip import java.util import day15.Jpoods
import org.apache.kafka.common.TopicPartition
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.kafka010.OffsetRange import scala.collection.mutable object DateMyRedis {
//保存数据到redis
/**
 * Stores the aggregated counts and the Kafka offsets of one batch in Redis.
 *
 * For every non-empty partition of `result` one MULTI/EXEC transaction is issued that
 *  - increments field "prive:city:fangshi" of hash "IP1" by the record's count, and
 *  - sets field "topic:partition" of hash `groupingID` to that range's untilOffset.
 *
 * If queuing a command fails, the transaction is discarded and the error is rethrown so the
 * task fails visibly. The connection is closed in all cases.
 *
 * NOTE(review): every non-empty partition still commits its own transaction, each carrying
 * the full set of offsets, so data and offsets are atomic per partition only, not per batch.
 *
 * @param result     counts keyed by a three-part string key
 * @param ranges     offset ranges of the batch that produced `result`
 * @param groupingID name of the Redis hash that holds the offsets
 */
def saveDataOffset(result: RDD[((String,String, String),Int)],ranges: Array[OffsetRange],groupingID: String): Unit ={
  import scala.util.control.NonFatal

  result.foreachPartition(records => {
    // An empty partition has nothing to write (the original wrote nothing for it either),
    // so skip borrowing a connection altogether.
    if (records.hasNext) {
      val jedis = Jpoods.getJedis()
      try {
        // Open the Redis transaction.
        val transaction = jedis.multi()
        try {
          // Queue the data increments.
          records.foreach { case ((prive, city, fangshi), count) =>
            transaction.hincrBy("IP1", prive + ":" + city + ":" + fangshi, count)
          }
          // Queue the offsets once per transaction; they do not depend on the record,
          // so repeating them for every record only added redundant commands.
          for (o <- ranges) {
            transaction.hset(groupingID, o.topic + ":" + o.partition, o.untilOffset.toString)
          }
        } catch {
          // NonFatal lets OutOfMemoryError, InterruptedException etc. propagate untouched.
          case NonFatal(e) =>
            println("报错了,需要回滚")
            transaction.discard()
            // Stop here: the original carried on queuing commands and called exec() on the
            // already-discarded transaction.
            throw e
        }
        transaction.exec()
      } finally {
        // Always hand the connection back, including on failure.
        jedis.close()
      }
    }
  })
}
//Fetch the offset information from Redis
/**
 * Reads the offsets stored in Redis hash `groupid`.
 *
 * Each hash field is expected to look like "topic:partition" (the format written by
 * saveDataOffset) and each value to be the offset as a decimal string.
 *
 * @param groupid name of the Redis hash holding the offsets
 * @param topic   not used by this method; every field of the hash is returned.
 *                NOTE(review): confirm whether filtering by topic was intended.
 * @return a mutable map from TopicPartition to the stored offset; empty when the hash has no fields
 * @throws NumberFormatException          if a partition or an offset is not numeric
 * @throws ArrayIndexOutOfBoundsException if a field contains no ':' separator
 */
def getOffset(groupid: String, topic: String): mutable.Map[TopicPartition, Long] = {
  // Explicit Java-to-Scala conversion (.asScala) instead of the deprecated implicit JavaConversions.
  import scala.collection.JavaConverters._

  val offset = mutable.Map[TopicPartition, Long]()
  val jedis = Jpoods.getJedis()
  try {
    val stored: util.Map[String, String] = jedis.hgetAll(groupid)
    for ((field, value) <- stored.asScala) {
      // field = "topic:partition": index 0 is the topic, index 1 the partition number.
      val parts = field.split(":")
      offset += new TopicPartition(parts(0), parts(1).toInt) -> value.toLong
    }
  } finally {
    // Close the connection; the original never closed it.
    jedis.close()
  }
  offset
}
} // end of object DateMyRedis
Jpoods.scala
package day15 import org.apache.commons.pool2.impl.GenericObjectPoolConfig
import redis.clients.jedis.{Jedis, JedisPool}

/**
 * Process-wide Jedis connection pool.
 *
 * NOTE(review): the numeric literals of this object (pool sizes, port, database index) were
 * missing from the source. The constants below are filled with the stock defaults of Redis
 * and commons-pool so that the object compiles — confirm each against the real deployment.
 */
object Jpoods {
  private val RedisHost = "192.168.186.150"
  // Assumption: standard Redis port — TODO confirm.
  private val RedisPort = 6379
  // Assumption: commons-pool default limits — TODO confirm.
  private val MaxIdle = 8
  private val MaxTotal = 8
  // Assumption: default Redis database — TODO confirm.
  private val DbIndex = 0

  // Pool settings
  private val conf = new GenericObjectPoolConfig()
  conf.setMaxIdle(MaxIdle)
  conf.setMaxTotal(MaxTotal)

  // Pass the pool settings to the pool; the original built `conf` but created the pool
  // from host and port only, so the settings were never applied.
  private val jpoods = new JedisPool(conf, RedisHost, RedisPort)

  /**
   * Takes a connection from the pool and selects database `DbIndex` on it.
   *
   * @return a Jedis connection; the caller should close() it when done
   */
  def getJedis(): Jedis = {
    val jedis = jpoods.getResource()
    jedis.select(DbIndex)
    jedis
  }
}
shell脚本 flum-kafka.conf
# Flume agent "a1": one TAILDIR source feeding a Kafka channel (no sink is declared here).
# Comments sit on their own lines: in a properties file a '#' after a value is part of the value.
a1.sources = r1
a1.channels = c1

# Source definition: tail the access log
a1.sources.r1.type = TAILDIR
# NOTE(review): the version digit in this path was missing ("1.8.-bin"); restored as 1.8.0 — confirm the install directory.
a1.sources.r1.positionFile = /usr/local/apache-flume-1.8.0-bin/taildir_position.json
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /root/myde/logs/access.log

# Channel definition: events are written to Kafka topic ip01
a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
# Broker ports were missing; 9092 matches the bootstrap.servers value used by the Spark job.
a1.channels.c1.kafka.bootstrap.servers = hadoop01:9092,hadoop02:9092,hadoop03:9092
a1.channels.c1.kafka.topic = ip01
a1.channels.c1.parseAsFlumeEvent = false

# Wire the source to the channel
a1.sources.r1.channels = c1