Skip to content

telegraf panics when collecting sockets for unix protocol #16527

@Yaming-Hub

Description

@Yaming-Hub

Relevant telegraf.conf

# Tags listed here are declared once for the whole agent.
# The "host" value appears as "..." in this report.
[global_tags]
resource_group = "my-resource-group"
cluster = "cluster01"
host = "..."

[agent]
# Identity ("hostname" appears as "..." in this report).
hostname = "..."
omit_hostname = false

# Collection schedule.
interval = "10s"
round_interval = true
collection_jitter = "5s"

# Buffering and flushing.
metric_batch_size = 1000
metric_buffer_limit = 300000
flush_interval = "30s"
flush_jitter = "0s"

# Logging.
debug = false
quiet = true
logfile = "/var/log/telegraf/telegraf.log"

###############################################################################
#                                  OUTPUTS                                    #
###############################################################################

# Ship metrics over UDP to 127.0.0.1:8089.
[[outputs.influxdb]]
urls = ["udp://127.0.0.1:8089"]
udp_payload = "512B"
timeout = "5s"
retention_policy = ""
write_consistency = "any"

# # Send metrics to a file - useful for debugging
# [[outputs.file]]
#  data_format = "influx"
#  files = ["/usr/bin/ice/telegraf.out"]

###############################################################################
#                                  INPUTS                                     #
###############################################################################

# Process stats for the "ice" executable.
# This is the only procstat instance in this config that requests the
# "sockets" property, and it includes the "unix" protocol. The reported
# panic originates in procstat.statsUnix -> netlink.UnixSocketDiagInfo,
# i.e. the "unix" socket collection path.
[[inputs.procstat]]
exe = "ice"
properties = ["cpu", "memory", "mmap", "sockets"]
socket_protocols = ["tcp4", "udp4", "unix"]
fieldinclude = ["cpu_usage", "memory_usage", "memory_rss", "memory_swap", "memory_vms", "memory_referenced", "num_fds"]

# Process stats for the "clickhouse" executable (no socket collection).
[[inputs.procstat]]
exe = "clickhouse"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage", "memory_usage", "memory_rss", "memory_swap", "memory_vms", "memory_referenced"]

# Process stats for the "telegraf" executable itself (no socket collection).
[[inputs.procstat]]
exe = "telegraf"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage", "memory_usage", "memory_rss", "memory_swap", "memory_vms", "memory_referenced"]

# Process stats for the "mdsd" executable (no socket collection).
[[inputs.procstat]]
exe = "mdsd"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage", "memory_usage", "memory_rss", "memory_swap", "memory_vms", "memory_referenced"]

# Process stats for the "pwsh" executable (no socket collection).
[[inputs.procstat]]
exe = "pwsh"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage", "memory_usage", "memory_rss", "memory_swap", "memory_vms", "memory_referenced"]

# Process stats for the "amacoreagent" executable (no socket collection).
[[inputs.procstat]]
exe = "amacoreagent"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage", "memory_usage", "memory_rss", "memory_swap", "memory_vms", "memory_referenced"]

# Process stats for the "MetricsExtensio" executable (no socket collection).
[[inputs.procstat]]
# Not a typo - the process name really looks like that
exe = "MetricsExtensio"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage", "memory_usage", "memory_rss", "memory_swap", "memory_vms", "memory_referenced"]

# Process stats for the "M365Linux" executable (no socket collection).
[[inputs.procstat]]
exe = "M365Linux"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage", "memory_usage", "memory_rss", "memory_swap", "memory_vms", "memory_referenced"]

# Process stats for the "applicationheal" executable (no socket collection).
[[inputs.procstat]]
exe = "applicationheal"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage", "memory_usage", "memory_rss", "memory_swap", "memory_vms", "memory_referenced"]

# Process stats for the "LinuxAgent" executable (no socket collection).
[[inputs.procstat]]
exe = "LinuxAgent"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage", "memory_usage", "memory_rss", "memory_swap", "memory_vms", "memory_referenced"]

# Process stats for the "azsecd" executable (no socket collection).
[[inputs.procstat]]
exe = "azsecd"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage", "memory_usage", "memory_rss", "memory_swap", "memory_vms", "memory_referenced"]

# Process stats for the "Qualys" executable (no socket collection).
[[inputs.procstat]]
exe = "Qualys"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage", "memory_usage", "memory_rss", "memory_swap", "memory_vms", "memory_referenced"]


# Scrape the ClickHouse HTTP endpoint at 127.0.0.1:8123 as user "ice".
[[inputs.clickhouse]]
username = "ice"
password = ""
servers = ["http://127.0.0.1:8123"]
auto_discovery = false
# Measurements to keep. Each entry is matched against the measurement name
# as written, so there must be no stray whitespace inside the quotes
# (a trailing space after "clickhouse_dictionaries" would never match).
namepass = [
  "clickhouse_tables",
  "clickhouse_dictionaries",
  "clickhouse_events",
  "clickhouse_metrics",
  "clickhouse_asynchronous_metrics",
  "clickhouse_processes"
]

[[inputs.disk]]
# Monitor the ClickHouse data directory
mount_points = ["/raid0"]
ignore_fs = ["tmpfs", "devtmpfs", "devfs"]
# Collect disk usage information (space used, free, and total space)
fieldinclude = ["total", "free", "used"]

# Total memory usage, restricted to the fields listed below.
[[inputs.mem]]
fieldinclude = ["total", "used", "free", "available", "available_percent"]

# Total CPU usage, restricted to the fields listed below.
[[inputs.cpu]]
fieldinclude = ["usage_user", "usage_system", "usage_idle", "usage_active"]

# Network metrics, restricted to the fields listed below.
[[inputs.net]]
fieldinclude = ["bytes_sent", "bytes_recv", "speed", "err_in", "err_out"]

# Monitor the health of the RAID array by running raid_health.sh, whose
# output is parsed as influx line protocol and reported as "raid_health".
[[inputs.exec]]
name_override = "raid_health"
commands = ["/usr/bin/ice/raid_health.sh"]
data_format = "influx"
# This input sets its own 60s interval and a 5s timeout.
interval = "60s"
timeout = "5s"

Logs from Telegraf

2025-02-19T01:34:31Z I! Loading config: /etc/telegraf/telegraf.conf
2025-02-19T01:34:44Z E! FATAL: [inputs.procstat] panicked: runtime error: slice bounds out of range [72:68], Stack:
goroutine 143 [running]:
github.com/influxdata/telegraf/agent.panicRecover(0xc000fdd380)
        /go/src/github.com/influxdata/telegraf/agent/agent.go:1196 +0x70
panic({0x9a6a920?, 0xc000935fe0?})
        /usr/local/go/src/runtime/panic.go:785 +0x132
github.com/vishvananda/netlink.(*Handle).UnixSocketDiagInfo.func1({0xc001020070, 0x44, 0xe28})
        /go/pkg/mod/github.com/vishvananda/netlink@v1.3.0/socket_linux.go:472 +0x206
github.com/vishvananda/netlink/nl.(*NetlinkRequest).ExecuteIter(0xc001711148, 0xc00100e780?, 0x14, 0xc001711190)
        /go/pkg/mod/github.com/vishvananda/netlink@v1.3.0/nl/nl_linux.go:627 +0x59e
github.com/vishvananda/netlink.(*Handle).UnixSocketDiagInfo(0x8e7ef80?)
        /go/pkg/mod/github.com/vishvananda/netlink@v1.3.0/socket_linux.go:460 +0x1ee
github.com/vishvananda/netlink.UnixSocketDiagInfo(...)
        /go/pkg/mod/github.com/vishvananda/netlink@v1.3.0/socket_linux.go:491
github.com/influxdata/telegraf/plugins/inputs/procstat.statsUnix({0xc0004ad810, 0x1, 0xc0017bda61?})
        /go/src/github.com/influxdata/telegraf/plugins/inputs/procstat/os_linux.go:320 +0x2a5
github.com/influxdata/telegraf/plugins/inputs/procstat.(*proc).metrics(0xc000583410, {0x0?, 0xc000000b67?}, 0xc001215f40, {0x3?, 0x4d1d00?, 0x11733660?})
        /go/src/github.com/influxdata/telegraf/plugins/inputs/procstat/process.go:337 +0x1c2e
github.com/influxdata/telegraf/plugins/inputs/procstat.(*Procstat).gatherOld(0xc001215dc0, {0xb1554a0, 0xc0017de7c0})
        /go/src/github.com/influxdata/telegraf/plugins/inputs/procstat/procstat.go:283 +0xa2f
github.com/influxdata/telegraf/plugins/inputs/procstat.(*Procstat).Gather(0x1154d350?, {0xb1554a0?, 0xc0017de7c0?})
        /go/src/github.com/influxdata/telegraf/plugins/inputs/procstat/procstat.go:213 +0x27
github.com/influxdata/telegraf/models.(*RunningInput).Gather(0xc000fdd380, {0xb1554a0, 0xc0017de7c0})
        /go/src/github.com/influxdata/telegraf/models/running_input.go:251 +0x251
github.com/influxdata/telegraf/agent.(*Agent).gatherOnce.func1()
        /go/src/github.com/influxdata/telegraf/agent/agent.go:583 +0x58
created by github.com/influxd
2025-02-19T01:34:44Z E! PLEASE REPORT THIS PANIC ON GITHUB with stack trace, configuration, and OS information: https://github.com/influxdata/telegraf/issues/new/choose
2025-02-19T01:34:44Z I! Loading config: /etc/telegraf/telegraf.conf

System info

telegraf 1.33.2, Ubuntu 20.04.6

Docker

No response

Steps to reproduce

Our service uses Telegraf to report telemetry from Linux apps. We noticed that, with the same configuration, Telegraf crashes on some Linux machines but runs fine on others. After reading through the code, we found that the panic comes from the netlink.UnixSocketDiagInfo() call.

Expected behavior

Telegraf should not crash.

Actual behavior

Telegraf keeps panicking every few seconds.

Additional info

No response

Metadata

Metadata

Assignees

Labels

bug (unexpected problem or unintended behavior), waiting for response (waiting for response from contributor)

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions