-
Notifications
You must be signed in to change notification settings - Fork 5.7k
Closed
Labels
bug (unexpected problem or unintended behavior), waiting for response (waiting for response from contributor)
Description
Relevant telegraf.conf
[global_tags]
resource_group = "my-resource-group"
cluster = "cluster01"
host = "..."
[agent]
interval = "10s"
debug = false
hostname = "..."
round_interval = true
flush_interval = "30s"
flush_jitter = "0s"
collection_jitter = "5s"
metric_batch_size = 1000
metric_buffer_limit = 300000
quiet = true
logfile = "/var/log/telegraf/telegraf.log"
omit_hostname = false
###############################################################################
# OUTPUTS #
###############################################################################
[[outputs.influxdb]]
urls = ["udp://127.0.0.1:8089"]
retention_policy = ""
write_consistency = "any"
timeout = "5s"
udp_payload = "512B"
# # Send metrics to a file - useful for debugging
# [[outputs.file]]
# data_format = "influx"
# files = ["/usr/bin/ice/telegraf.out"]
###############################################################################
# INPUTS #
###############################################################################
[[inputs.procstat]]
exe = "ice"
properties = ["cpu", "memory", "mmap", "sockets"]
fieldinclude = ["cpu_usage","memory_usage","memory_rss", "memory_swap", "memory_vms", "memory_referenced", "num_fds"]
socket_protocols = ["tcp4", "udp4", "unix"]
[[inputs.procstat]]
exe = "clickhouse"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage","memory_usage","memory_rss", "memory_swap", "memory_vms", "memory_referenced"]
[[inputs.procstat]]
exe = "telegraf"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage","memory_usage","memory_rss", "memory_swap", "memory_vms", "memory_referenced"]
[[inputs.procstat]]
exe = "mdsd"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage","memory_usage","memory_rss", "memory_swap", "memory_vms", "memory_referenced"]
[[inputs.procstat]]
exe = "pwsh"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage","memory_usage","memory_rss", "memory_swap", "memory_vms", "memory_referenced"]
[[inputs.procstat]]
exe = "amacoreagent"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage","memory_usage","memory_rss", "memory_swap", "memory_vms", "memory_referenced"]
[[inputs.procstat]]
exe = "MetricsExtensio" # Not a typo - the process name really looks like that
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage","memory_usage","memory_rss", "memory_swap", "memory_vms", "memory_referenced"]
[[inputs.procstat]]
exe = "M365Linux"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage","memory_usage","memory_rss", "memory_swap", "memory_vms", "memory_referenced"]
[[inputs.procstat]]
exe = "applicationheal"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage","memory_usage","memory_rss", "memory_swap", "memory_vms", "memory_referenced"]
[[inputs.procstat]]
exe = "LinuxAgent"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage","memory_usage","memory_rss", "memory_swap", "memory_vms", "memory_referenced"]
[[inputs.procstat]]
exe = "azsecd"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage","memory_usage","memory_rss", "memory_swap", "memory_vms", "memory_referenced"]
[[inputs.procstat]]
exe = "Qualys"
properties = ["cpu", "memory", "mmap"]
fieldinclude = ["cpu_usage","memory_usage","memory_rss", "memory_swap", "memory_vms", "memory_referenced"]
[[inputs.clickhouse]]
username = "ice"
password = ""
servers = ["http://127.0.0.1:8123"]
auto_discovery = false
namepass = ["clickhouse_tables","clickhouse_dictionaries ","clickhouse_events","clickhouse_metrics","clickhouse_asynchronous_metrics","clickhouse_processes"]
[[inputs.disk]]
# Monitor the Clickhouse data directory
mount_points = ["/raid0"]
fieldinclude = ["total", "free", "used"]
# Collect disk usage information (space used, free, and total space)
ignore_fs = ["tmpfs", "devtmpfs", "devfs"]
[[inputs.mem]]
# Monitor total memory usage
fieldinclude = ["total", "used", "free", "available", "available_percent"]
[[inputs.cpu]]
# Read total cpu usage
fieldinclude = ["usage_user", "usage_system", "usage_idle", "usage_active"]
[[inputs.net]]
# Read network metrics
fieldinclude = ["bytes_sent", "bytes_recv", "speed", "err_in", "err_out"]
# Monitor the health of the RAID array
[[inputs.exec]]
commands = ["/usr/bin/ice/raid_health.sh"]
data_format = "influx"
timeout = "5s"
name_override = "raid_health"
interval = "60s"
Logs from Telegraf
2025-02-19T01:34:31Z I! Loading config: /etc/telegraf/telegraf.conf
2025-02-19T01:34:44Z E! FATAL: [inputs.procstat] panicked: runtime error: slice bounds out of range [72:68], Stack:
goroutine 143 [running]:
github.com/influxdata/telegraf/agent.panicRecover(0xc000fdd380)
/go/src/github.com/influxdata/telegraf/agent/agent.go:1196 +0x70
panic({0x9a6a920?, 0xc000935fe0?})
/usr/local/go/src/runtime/panic.go:785 +0x132
github.com/vishvananda/netlink.(*Handle).UnixSocketDiagInfo.func1({0xc001020070, 0x44, 0xe28})
/go/pkg/mod/github.com/vishvananda/netlink@v1.3.0/socket_linux.go:472 +0x206
github.com/vishvananda/netlink/nl.(*NetlinkRequest).ExecuteIter(0xc001711148, 0xc00100e780?, 0x14, 0xc001711190)
/go/pkg/mod/github.com/vishvananda/netlink@v1.3.0/nl/nl_linux.go:627 +0x59e
github.com/vishvananda/netlink.(*Handle).UnixSocketDiagInfo(0x8e7ef80?)
/go/pkg/mod/github.com/vishvananda/netlink@v1.3.0/socket_linux.go:460 +0x1ee
github.com/vishvananda/netlink.UnixSocketDiagInfo(...)
/go/pkg/mod/github.com/vishvananda/netlink@v1.3.0/socket_linux.go:491
github.com/influxdata/telegraf/plugins/inputs/procstat.statsUnix({0xc0004ad810, 0x1, 0xc0017bda61?})
/go/src/github.com/influxdata/telegraf/plugins/inputs/procstat/os_linux.go:320 +0x2a5
github.com/influxdata/telegraf/plugins/inputs/procstat.(*proc).metrics(0xc000583410, {0x0?, 0xc000000b67?}, 0xc001215f40, {0x3?, 0x4d1d00?, 0x11733660?})
/go/src/github.com/influxdata/telegraf/plugins/inputs/procstat/process.go:337 +0x1c2e
github.com/influxdata/telegraf/plugins/inputs/procstat.(*Procstat).gatherOld(0xc001215dc0, {0xb1554a0, 0xc0017de7c0})
/go/src/github.com/influxdata/telegraf/plugins/inputs/procstat/procstat.go:283 +0xa2f
github.com/influxdata/telegraf/plugins/inputs/procstat.(*Procstat).Gather(0x1154d350?, {0xb1554a0?, 0xc0017de7c0?})
/go/src/github.com/influxdata/telegraf/plugins/inputs/procstat/procstat.go:213 +0x27
github.com/influxdata/telegraf/models.(*RunningInput).Gather(0xc000fdd380, {0xb1554a0, 0xc0017de7c0})
/go/src/github.com/influxdata/telegraf/models/running_input.go:251 +0x251
github.com/influxdata/telegraf/agent.(*Agent).gatherOnce.func1()
/go/src/github.com/influxdata/telegraf/agent/agent.go:583 +0x58
created by github.com/influxd
2025-02-19T01:34:44Z E! PLEASE REPORT THIS PANIC ON GITHUB with stack trace, configuration, and OS information: https://github.com/influxdata/telegraf/issues/new/choose
2025-02-19T01:34:44Z I! Loading config: /etc/telegraf/telegraf.conf
System info
telegraf 1.33.2, Ubuntu 20.04.6
Docker
No response
Steps to reproduce
Our service uses Telegraf to report telemetry from Linux apps. We noticed that, with the same configuration, Telegraf crashes on some Linux machines but runs fine on others. After reading through the code, we found that the panic originates in the netlink.UnixSocketDiagInfo() call.
Expected behavior
Telegraf should not crash.
Actual behavior
Telegraf keeps panicking every few seconds.
Additional info
No response
Metadata
Metadata
Assignees
Labels
bug (unexpected problem or unintended behavior), waiting for response (waiting for response from contributor)