streaming消费kafka手动维护offset到redis

🕗 发布于 2024-11-20 20:29 kafka redis 分布式 spark

1.redis工具类


```scala
package com.qupojie.kafka_offset

import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange
import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}
import java.util
import scala.collection.JavaConverters.mapAsScalaMapConverter
import scala.collection.mutable

/**
 * Helper for persisting and restoring Kafka consumer offsets in Redis.
 *
 * Offsets live in Redis database 1, one hash per topic/group pair:
 * key `kafka:topic:<topic>:<groupId>`, field = partition number, value = offset.
 */
object RedisUtils {

  private val config = new JedisPoolConfig
  private val redisHost = "hadoop110"
  private val redisPort = 6379
  config.setMaxTotal(30)
  config.setMaxIdle(10)
  private val pool = new JedisPool(config, redisHost, redisPort, 10000)
  private val topicPrefix = "kafka:topic"
  // Redis logical database that holds the offset hashes.
  private val offsetDbIndex = 1

  private def getKey(topic: String, groupId: String, prefix: String = topicPrefix): String = s"$prefix:$topic:$groupId"
  private def getRedisConnection: Jedis = pool.getResource

  /**
   * Borrows a connection from the pool, selects the offset database, runs `body`
   * and always hands the connection back, even when `body` throws.
   */
  private def withOffsetDb[A](body: Jedis => A): A = {
    val jedis: Jedis = getRedisConnection
    try {
      jedis.select(offsetDbIndex)
      body(jedis)
    } finally {
      jedis.close()
    }
  }

  /**
   * Reads the stored offsets of every topic in `topics` for the given consumer group.
   *
   * A fresh map is built on every call, so results of earlier calls never leak into
   * later ones.
   *
   * @param topics            topics whose offsets should be restored
   * @param groupId           consumer group the offsets belong to
   * @param defaultPartitions number of partitions (0 until n) to initialise at offset 0
   *                          when Redis holds nothing for a topic; defaults to 3
   * @return partition -> next offset to read, for all requested topics
   */
  def getOffsetsFromRedis(topics: Array[String], groupId: String, defaultPartitions: Int = 3): mutable.Map[TopicPartition, Long] = {
    val offsets: mutable.Map[TopicPartition, Long] = mutable.Map()
    // One connection serves all topics; it is released only after the last lookup.
    withOffsetDb { jedis =>
      topics.foreach { topic =>
        val stored: util.Map[String, String] = jedis.hgetAll(getKey(topic, groupId))
        if (stored.isEmpty) {
          // Nothing saved yet for this topic/group: start every partition at offset 0.
          // NOTE(review): assumes offset 0 is still available on the broker - confirm.
          (0 until defaultPartitions).foreach { partition =>
            offsets.put(new TopicPartition(topic, partition), 0L)
          }
        } else {
          // Hash fields are partition numbers, values are offsets (both stored as strings).
          stored.asScala.foreach { case (partition, offset) =>
            offsets.put(new TopicPartition(topic, partition.toInt), offset.toLong)
          }
        }
      }
    }
    offsets
  }

  /**
   * Stores the `untilOffset` of every range, grouped into one Redis hash per topic.
   *
   * @param ranges  offset ranges of the batch that has just been processed
   * @param groupId consumer group the offsets belong to
   */
  def saveOffsetsToRedis(ranges: Array[OffsetRange], groupId: String): Unit = {
    import scala.collection.JavaConverters._
    withOffsetDb { jedis =>
      ranges.groupBy(_.topic).foreach { case (topic, topicRanges) =>
        val offsets: Map[String, String] =
          topicRanges.map(range => range.partition.toString -> range.untilOffset.toString).toMap
        jedis.hmset(getKey(topic, groupId), offsets.asJava)
      }
    }
  }
}
```

2.spark streaming实现类

```scala
package com.qupojie.kafka_offset

import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.kafka.common.TopicPartition
import org.apache.spark.{SparkConf, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Assign
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, KafkaUtils, LocationStrategies, OffsetRange}
import org.apache.spark.streaming.{Durations, Seconds, StreamingContext}

import scala.collection.mutable

/**
 * Spark Streaming job that reads a Kafka topic through the kafka010 direct stream
 * and keeps its own offsets in Redis (via `RedisUtils`) instead of letting the
 * Kafka consumer auto-commit them.
 */
object SparkConsumerKafka01 {
  /**
   * Restores the starting offsets from Redis, builds the direct stream and, for each
   * batch, prints the offset range of every partition and then saves the batch's
   * end offsets back to Redis.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("SparkConsumerKafka01")
      .setMaster("local[*]")
      // Rate limiting is a Spark setting; passed as a Kafka consumer property it is ignored.
      .set("spark.streaming.kafka.maxRatePerPartition", "10")
    val ssc: StreamingContext = new StreamingContext(conf = conf, batchDuration = Durations.seconds(5))

    val groupId = "SparkConsumerKafka01"
    val kafkaParams: mutable.HashMap[String, String] = new mutable.HashMap[String, String]()
    // NOTE(review): hadoop112 appears twice in the broker list - possibly hadoop111 was intended; confirm.
    kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "hadoop110:9092,hadoop112:9092,hadoop112:9092")
    kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer")
    kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer")
    // Offsets are tracked in Redis, so Kafka must not commit them on its own.
    kafkaParams.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false")
    kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId)
    kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest") // start from the newest position

    val topics = Array("spark_test02")
    // Starting position of every partition, as last recorded in Redis for this group.
    val fromOffsets: mutable.Map[TopicPartition, Long] = RedisUtils.getOffsetsFromRedis(topics, groupId)
    val stream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Assign[String, String](fromOffsets.keys.toList, kafkaParams, fromOffsets)
    )

    stream.foreachRDD(rdd => {
      println("--------当前时间" + System.currentTimeMillis() + "--------")
      // Offset ranges must be taken from the RDD produced directly by the stream.
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd.foreachPartition(_ => {
        // RDD partition i corresponds to offsetRanges(i) for a direct stream.
        val o: OffsetRange = offsetRanges(TaskContext.get.partitionId)
        println(s"${o.topic} 分区： ${o.partition} 起始偏移量：${o.fromOffset} 结束偏移量： ${o.untilOffset}")
      })
      // Persist offsets once per batch, on the driver, only after every partition
      // has finished; saving from inside each task would record the offsets of
      // partitions that may not have completed yet.
      RedisUtils.saveOffsetsToRedis(offsetRanges, groupId)
    })

    ssc.start()
    ssc.awaitTermination()
  }
}

```

3.运行图

在这里插入图片描述

原文地址：https://blog.csdn.net/qq_42890382/article/details/143870681

免责声明：本站文章内容转载自网络资源，如本站内容侵犯了原著者的合法权益，可联系本站删除。更多内容请关注自学内容网（zxcms.com）！

上一篇：Qt 文件管理
下一篇：【单片机基础】中断系统的作用与配置

论文阅读——Intrusion detection systems using long short-term memory (LSTM)
作者提出的 LSTM 模型能够有效区分正常网络流量和攻击流量。除此之外，模型结合主成分分析（PCA）和互信息作为降维方法。实验结果表明，基于 PCA 的模型（特别是使用2个主成分）在二分类和多分类任务
阅读更多2024-11-21
用源码编译虚幻引擎，并打包到安卓平台
本文详细介绍了如何用源码编译虚幻引擎，并将其打包到安卓平台。
阅读更多2024-11-21
Vue项目开发 element-UI 前端实现 1到10排列选择的按钮
在 Element UI 中，你可以通过来实现按钮的排列选择，例如让用户选择 1 到 10 之间的数字。为了实现这一功能，我们可以使用来动态生成 1 到 10 的按钮，并通过按钮点击事件来更新
阅读更多2024-11-21
Java EE 【知识改变命运】01计算机的一些知识点
计算机一些基础知识
阅读更多2024-11-21
04 —— Webpack打包CSS代码
加载器style-loader：把解析后的css代码插入到DOM。加载器css-loader ：解析css代码。直接引用，不用变量接收。
阅读更多2024-11-21
万能程序补丁工具 C# 源代码详解
万能程序补丁工具程序目的：搜索二进制可执行 EXE 或 DLL 文件分析的特征代码，替换特征代码，达到调试修正目标程序的功能。
阅读更多2024-11-21
【MySQL数据库】C#实现MySQL数据库最简单的查询和执行函数
C#和MySQL数据库是常见的数据交互，标准的查询和执行方法如下，做个记录。
阅读更多2024-11-21
单条推理转批量推理prompt
在每个线程中设置环境变量 CUDA_VISIBLE_DEVICES，以确保每个线程只使用指定的GPU。使用 concurrent.futures.ThreadPoolExecutor 来管理多线程任务
阅读更多2024-11-21
【AIGC】ChatGPT提示词Prompt解析：情感分析，分手后还可以做朋友吗？
【AIGC】在情感博弈中，最重要的是保持清醒的认知和优雅的态度。识别控制话术不是为了对抗，而是为了更好地保护自己的情感自由，实现真正的成长。
阅读更多2024-11-21
pycharm中配置pyqt5
PyQt和wxPython则提供了更多的控件和更强大的功能，适合于需要复杂用户界面的应用程序。pyQt生成的应用程序，引用图片通常是将资源文件装换为 python 文件，然后引用资源文件，而不能直接加
阅读更多2024-11-21

streaming消费kafka手动维护offset到redis

1.redis工具类

2.spark streaming实现类

3.运行图

相关文章