Spark Structured Streaming中使用Dataset的groupBy agg 与 join 示例(java api)

Dataset的groupBy agg示例

// Step 1: collapse dsParsed to one row per (enodeb_id, ecell_id) group.
// Every non-key column keeps the first value seen in the group, and the
// occurrences of enodeb_id are counted. The aggregates carry a temporary "1"
// suffix (presumably to keep them distinct from the source column names —
// confirm) which is removed again in step 2.
Dataset<Row> aggregatedPerCell = dsParsed
        .groupBy("enodeb_id", "ecell_id")
        .agg(
                functions.first("scan_start_time").alias("scan_start_time1"),
                functions.first("insert_time").alias("insert_time1"),
                functions.first("mr_type").alias("mr_type1"),
                functions.first("mr_ltescphr").alias("mr_ltescphr1"),
                functions.first("mr_ltescpuschprbnum").alias("mr_ltescpuschprbnum1"),
                functions.count("enodeb_id").alias("rows1"));

// Step 2: project the aggregated columns back to their original names, with
// ecell_id placed before enodeb_id in the output.
Dataset<Row> resultDs = aggregatedPerCell.selectExpr(
        "ecell_id",
        "enodeb_id",
        "scan_start_time1 as scan_start_time",
        "insert_time1 as insert_time",
        "mr_type1 as mr_type",
        "mr_ltescphr1 as mr_ltescphr",
        "mr_ltescpuschprbnum1 as mr_ltescpuschprbnum",
        "rows1 as rows");

Dataset Join示例：

        // Left side: a pipe-delimited CSV with a header row, read from /user/csv.
        Dataset<Row> ncRes = sparkSession.read().option("delimiter", "|").option("header", true).csv("/user/csv");

        // Right side: the result of a SQL query (the query text is elided in this example).
        Dataset<Row> mro=sparkSession.sql("。。。");

        // Left outer join keeps every ncRes row; the join condition requires both
        // "id" and "calid" to be equal on the two sides.
        Dataset<Row> ncJoinMro = ncRes

                .join(mro, mro.col("id").equalTo(ncRes.col("id")).and(mro.col("calid").equalTo(ncRes.col("calid"))), "left_outer")

                // Columns are selected through their owning Dataset (ncRes.col / mro.col),
                // and ncRes.id is exposed under the name "int_id".
                .select(ncRes.col("id").as("int_id"),

                        mro.col("vendor_id"),

                        // Remaining selected columns are elided in this example.
                         。。。
                 );

join condition的另外一种写法(Scala API，连接条件用SQL表达式字符串表示):

// Scala API: the join condition is given as a SQL expression string via expr().
// Rows match when the ids are equal and leftDfTime falls within
// [rightDfTime, rightDfTime + 1 hour]; the join type is a left outer join.
leftDfWithWatermark.join(rightDfWithWatermark,
  expr(""" leftDfId = rightDfId AND leftDfTime >= rightDfTime AND leftDfTime <= rightDfTime + interval 1 hour"""),
  joinType = "leftOuter")

BroadcastHashJoin示例：

package com.dx.testbroadcast;

import org.apache.spark.SparkConf;

import org.apache.spark.sql.Dataset;

import org.apache.spark.sql.Row;

import org.apache.spark.sql.SparkSession;

import org.apache.spark.sql.functions;

import java.io.*;

public class Test {

    public static void main(String[] args) {

        String personPath = "E:\\person.csv";

        String personOrderPath = "E:\\personOrder.csv";

        //writeToPersion(personPath);

        //writeToPersionOrder(personOrderPath);

        SparkConf conf = new SparkConf();

        SparkSession sparkSession = SparkSession.builder().config(conf).appName("test-broadcast-app").master("local[*]").getOrCreate();

        Dataset<Row> person = sparkSession.read()

                .option("header", "true")

                .option("inferSchema", "true") //是否自动推到内容的类型

                .option("delimiter", ",").csv(personPath).as("person");

        person.printSchema();

        Dataset<Row> personOrder = sparkSession.read()

                .option("header", "true")

                .option("inferSchema", "true") //是否自动推到内容的类型

                .option("delimiter", ",").csv(personOrderPath).as("personOrder");

        personOrder.printSchema();

        // Default `inner`. Must be one of:`inner`, `cross`, `outer`, `full`, `full_outer`, `left`, `left_outer`,`right`, `right_outer`, `left_semi`, `left_anti`.

        Dataset<Row> resultDs = personOrder.join(functions.broadcast(person), personOrder.col("personid").equalTo(person.col("id")),"left");

        resultDs.explain();
        resultDs.show(10);

    }

    private static void writeToPersion(String personPath) {

        BufferedWriter personWriter = null;

        try {

            personWriter = new BufferedWriter(new FileWriter(personPath));

            personWriter.write("id,name,age,address\r\n");

            for (int i = ; i < ; i++) {

                personWriter.write("" + i + ",person-" + i + "," + i + ",address-address-address-address-address-address-address" + i + "\r\n");

            }

        } catch (Exception e) {

            e.printStackTrace();

        } finally {

            if (personWriter != null) {

                try {

                    personWriter.close();

                } catch (IOException e) {

                    e.printStackTrace();

                }

            }

        }

    }

    private static void writeToPersionOrder(String personOrderPath) {

        BufferedWriter personWriter = null;

        try {

            personWriter = new BufferedWriter(new FileWriter(personOrderPath));

            personWriter.write("personid,name,age,address\r\n");

            for (int i = ; i < ; i++) {

                personWriter.write("" + i + ",person-" + i + "," + i + ",address-address-address-address-address-address-address" + i + "\r\n");

            }

        } catch (Exception e) {

            e.printStackTrace();

        } finally {

            if (personWriter != null) {

                try {

                    personWriter.close();

                } catch (IOException e) {

                    e.printStackTrace();

                }

            }

        }

    }

}

打印结果：

== Physical Plan ==

*() BroadcastHashJoin [personid#], [id#], LeftOuter, BuildRight

:- *() FileScan csv [personid#,name#,age#,address#] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/E:/personOrder.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<personid:int,name:string,age:int,address:string>

+- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[, int, true] as bigint)))

   +- *() Project [id#, name#, age#, address#]

      +- *() Filter isnotnull(id#)

         +- *() FileScan csv [id#,name#,age#,address#] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/E:/person.csv], PartitionFilters: [], PushedFilters: [IsNotNull(id)], ReadSchema: struct<id:int,name:string,age:int,address:string>

+--------+--------+---+--------------------+---+--------+---+--------------------+

|personid|    name|age|             address| id|    name|age|             address|

+--------+--------+---+--------------------+---+--------+---+--------------------+

|       |person-|  |address-address-a...|  |person-|  |address-address-a...|

|       |person-|  |address-address-a...|  |person-|  |address-address-a...|

|       |person-|  |address-address-a...|  |person-|  |address-address-a...|

|       |person-|  |address-address-a...|  |person-|  |address-address-a...|

|       |person-|  |address-address-a...|  |person-|  |address-address-a...|

|       |person-|  |address-address-a...|  |person-|  |address-address-a...|

|       |person-|  |address-address-a...|  |person-|  |address-address-a...|

|       |person-|  |address-address-a...|  |person-|  |address-address-a...|

|       |person-|  |address-address-a...|  |person-|  |address-address-a...|

|       |person-|  |address-address-a...|  |person-|  |address-address-a...|

+--------+--------+---+--------------------+---+--------+---+--------------------+

only showing top 10 rows

SparkSQL Broadcast HashJoin

        // Expose both Datasets to Spark SQL under temporary view names.
        person.createOrReplaceTempView("temp_person");
        personOrder.createOrReplaceTempView("temp_person_order");

        // Inner join of orders (t10) to persons (t11) on t11.id = t10.personid.
        // The BROADCAST hint names t11 (temp_person) as the side to broadcast.
        String broadcastJoinSql = String.join("",
                " SELECT /*+ BROADCAST (t11) */",
                " t11.id,t11.name,t11.age,t11.address,",
                " t10.personid as person_id,t10.name as persion_order_name",
                " FROM temp_person_order as t10 ",
                " inner join temp_person as t11",
                " on t11.id = t10.personid ");

        Dataset<Row> sqlResult = sparkSession.sql(broadcastJoinSql);

        // Print the rows first, then the physical plan.
        sqlResult.show();
        sqlResult.explain();

打印日志

+---+--------+---+--------------------+---------+------------------+

| id|    name|age|             address|person_id|persion_order_name|

+---+--------+---+--------------------+---------+------------------+

|  |person-|  |address-address-a...|        |          person-|

|  |person-|  |address-address-a...|        |          person-|

|  |person-|  |address-address-a...|        |          person-|

|  |person-|  |address-address-a...|        |          person-|

|  |person-|  |address-address-a...|        |          person-|

|  |person-|  |address-address-a...|        |          person-|

|  |person-|  |address-address-a...|        |          person-|

|  |person-|  |address-address-a...|        |          person-|

|  |person-|  |address-address-a...|        |          person-|

|  |person-|  |address-address-a...|        |          person-|

+---+--------+---+--------------------+---------+------------------+

only showing top  rows

// :: INFO FileSourceStrategy: Pruning directories with:

// :: INFO FileSourceStrategy: Post-Scan Filters: isnotnull(personid#)

// :: INFO FileSourceStrategy: Output Data Schema: struct<personid: int, name: string>

// :: INFO FileSourceScanExec: Pushed Filters: IsNotNull(personid)

// :: INFO FileSourceStrategy: Pruning directories with:

// :: INFO FileSourceStrategy: Post-Scan Filters: isnotnull(id#)

// :: INFO FileSourceStrategy: Output Data Schema: struct<id: int, name: string, age: int, address: string ...  more fields>

// :: INFO FileSourceScanExec: Pushed Filters: IsNotNull(id)

== Physical Plan ==

*() Project [id#, name#, age#, address#, personid# AS person_id#, name# AS persion_order_name#]

+- *() BroadcastHashJoin [personid#], [id#], Inner, BuildRight

   :- *() Project [personid#, name#]

   :  +- *() Filter isnotnull(personid#)

   :     +- *() FileScan csv [personid#,name#] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/E:/personOrder.csv], PartitionFilters: [], PushedFilters: [IsNotNull(personid)], ReadSchema: struct<personid:int,name:string>

   +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[, int, true] as bigint)))

      +- *() Project [id#, name#, age#, address#]

         +- *() Filter isnotnull(id#)

            +- *() FileScan csv [id#,name#,age#,address#] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/E:/person.csv], PartitionFilters: [], PushedFilters: [IsNotNull(id)], ReadSchema: struct<id:int,name:string,age:int,address:string>

// :: INFO SparkContext: Invoking stop() from shutdown hook