add infrastructure monitoring

Add ClickStack (HyperDX) as the aggregation platform.
Configure the OTel Collector to collect host metrics.
Marcel Arndt 2026-01-07 15:08:22 +01:00
parent 4eeaf483bc
commit 0496728700
13 changed files with 706 additions and 14 deletions
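
This commit introduces two new roles: a ClickStack stack (ClickHouse, the HyperDX app, and a gateway OTel Collector) and an infrastructure-monitoring stack with a per-node OTel agent. The playbook wiring is not part of this diff; a minimal sketch, assuming the roles are named hyperdx and monitoring and that a managers group exists, might look like this:

# Hypothetical playbook snippet -- role and group names are assumptions, not shown in this commit.
- hosts: managers
  become: true
  roles:
    - hyperdx      # ClickStack: ClickHouse, HyperDX app, gateway OTel Collector
    - monitoring   # global OTel agent stack for host metrics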

View File

@@ -90,4 +90,19 @@
    src: "{{ hostvars[groups['managers'][0]]['ceph_bootstrap_ip'] }}:/"
    fstype: ceph
    opts: "name=admin,secret={{ ceph_admin_key.stdout }}"
    state: mounted

# Enable metrics
- name: CEPH | Check whether the Prometheus module is already enabled
  ansible.builtin.command: "ceph mgr module ls --format json"
  register: ceph_modules_status
  changed_when: false
  delegate_to: "{{ groups['managers'][0] }}"
  run_once: true

- name: CEPH | Enable the Prometheus module
  ansible.builtin.command: "ceph mgr module enable prometheus"
  # Check the JSON output for whether 'prometheus' is missing from the 'enabled_modules' list
  when: "'prometheus' not in (ceph_modules_status.stdout | from_json).enabled_modules"
  delegate_to: "{{ groups['managers'][0] }}"
  run_once: true
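
The `when` condition above only enables the module if it is missing from the manager's module list. For reference, the JSON it parses is roughly of this shape (trimmed and illustrative; actual module names vary per cluster):

# Approximate shape of `ceph mgr module ls --format json` that the condition relies on:
{
  "enabled_modules": ["dashboard", "prometheus"],
  "disabled_modules": [{"name": "telegraf"}, {"name": "zabbix"}]
}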

View File

@@ -0,0 +1,175 @@
<?xml version="1.0"?>
<clickhouse>
    <logger>
        <level>debug</level>
        <console>true</console>
        <log remove="remove" />
        <errorlog remove="remove" />
    </logger>

    <listen_host>0.0.0.0</listen_host>
    <http_port>8123</http_port>
    <tcp_port>9000</tcp_port>
    <interserver_http_host>ch-server</interserver_http_host>
    <interserver_http_port>9009</interserver_http_port>

    <max_connections>4096</max_connections>
    <keep_alive_timeout>64</keep_alive_timeout>
    <max_concurrent_queries>100</max_concurrent_queries>
    <uncompressed_cache_size>8589934592</uncompressed_cache_size>
    <mark_cache_size>5368709120</mark_cache_size>

    <path>/var/lib/clickhouse/</path>
    <tmp_path>/var/lib/clickhouse/tmp/</tmp_path>
    <user_files_path>/var/lib/clickhouse/user_files/</user_files_path>

    <user_directories>
        <users_xml>
            <path>users.xml</path>
        </users_xml>
    </user_directories>
    <!-- <users_config>users.xml</users_config> -->

    <default_profile>default</default_profile>
    <default_database>default</default_database>
    <timezone>UTC</timezone>
    <mlock_executable>false</mlock_executable>

    <!-- Prometheus exporter -->
    <prometheus>
        <endpoint>/metrics</endpoint>
        <port>9363</port>
        <metrics>true</metrics>
        <events>true</events>
        <asynchronous_metrics>true</asynchronous_metrics>
        <errors>true</errors>
    </prometheus>

    <!-- Query log. Used only for queries with setting log_queries = 1. -->
    <query_log>
        <database>system</database>
        <table>query_log</table>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </query_log>

    <!-- Metric log contains rows with current values of ProfileEvents, CurrentMetrics collected
         with "collect_interval_milliseconds" interval. -->
    <metric_log>
        <database>system</database>
        <table>metric_log</table>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
        <collect_interval_milliseconds>1000</collect_interval_milliseconds>
    </metric_log>

    <!--
        Asynchronous metric log contains values of metrics from
        system.asynchronous_metrics.
    -->
    <asynchronous_metric_log>
        <database>system</database>
        <table>asynchronous_metric_log</table>
        <!--
            Asynchronous metrics are updated once a minute, so there is
            no need to flush more often.
        -->
        <flush_interval_milliseconds>7000</flush_interval_milliseconds>
    </asynchronous_metric_log>

    <!--
        OpenTelemetry log contains OpenTelemetry trace spans.
    -->
    <opentelemetry_span_log>
        <!--
            The default table creation code is insufficient, this <engine> spec
            is a workaround. There is no 'event_time' for this log, but two times,
            start and finish. It is sorted by finish time, to avoid inserting
            data too far away in the past (probably we can sometimes insert a span
            that is seconds earlier than the last span in the table, due to a race
            between several spans inserted in parallel). This gives the spans a
            global order that we can use to e.g. retry insertion into some external
            system.
        -->
        <engine>
            engine MergeTree
            partition by toYYYYMM(finish_date)
            order by (finish_date, finish_time_us, trace_id)
        </engine>
        <database>system</database>
        <table>opentelemetry_span_log</table>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </opentelemetry_span_log>

    <!-- Crash log. Stores stack traces for fatal errors.
         This table is normally empty. -->
    <crash_log>
        <database>system</database>
        <table>crash_log</table>
        <partition_by />
        <flush_interval_milliseconds>1000</flush_interval_milliseconds>
    </crash_log>

    <!-- Profiling on Processors level. -->
    <processors_profile_log>
        <database>system</database>
        <table>processors_profile_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </processors_profile_log>

    <!-- Part log contains information about all actions with parts in MergeTree tables
         (creation, deletion, merges, downloads). -->
    <part_log>
        <database>system</database>
        <table>part_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </part_log>

    <!-- Trace log. Stores stack traces collected by query profilers.
         See query_profiler_real_time_period_ns and query_profiler_cpu_time_period_ns settings. -->
    <trace_log>
        <database>system</database>
        <table>trace_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </trace_log>

    <!-- Query thread log. Has information about all threads participating in query execution.
         Used only for queries with setting log_query_threads = 1. -->
    <query_thread_log>
        <database>system</database>
        <table>query_thread_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </query_thread_log>

    <!-- Query views log. Has information about all dependent views associated with a query.
         Used only for queries with setting log_query_views = 1. -->
    <query_views_log>
        <database>system</database>
        <table>query_views_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </query_views_log>

    <remote_servers>
        <hdx_cluster>
            <shard>
                <replica>
                    <host>ch-server</host>
                    <port>9000</port>
                </replica>
            </shard>
        </hdx_cluster>
    </remote_servers>

    <distributed_ddl>
        <path>/clickhouse/task_queue/ddl</path>
    </distributed_ddl>

    <format_schema_path>/var/lib/clickhouse/format_schemas/</format_schema_path>
</clickhouse>
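
The <prometheus> block above exposes ClickHouse's own metrics on port 9363 under /metrics. Nothing in this commit scrapes that endpoint yet; if wanted, a job could be added to a collector's prometheus receiver, e.g. (a sketch, assuming the scraping collector can reach ch-server on the stack's internal network):

# Hypothetical scrape job for ClickHouse's built-in exporter -- not part of this commit.
prometheus:
  config:
    scrape_configs:
      - job_name: 'clickhouse'
        scrape_interval: 30s
        static_configs:
          - targets: ['ch-server:9363']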

View File

@@ -0,0 +1,51 @@
<?xml version="1.0"?>
<clickhouse>
    <profiles>
        <default>
            <max_memory_usage>10000000000</max_memory_usage>
            <use_uncompressed_cache>0</use_uncompressed_cache>
            <load_balancing>in_order</load_balancing>
            <log_queries>1</log_queries>
        </default>
    </profiles>

    <users>
        <default>
            <password></password>
            <profile>default</profile>
            <networks>
                <ip>::/0</ip>
            </networks>
            <quota>default</quota>
        </default>
        <api>
            <password>api</password>
            <profile>default</profile>
            <networks>
                <ip>::/0</ip>
            </networks>
            <quota>default</quota>
        </api>
        <worker>
            <password>worker</password>
            <profile>default</profile>
            <networks>
                <ip>::/0</ip>
            </networks>
            <quota>default</quota>
        </worker>
    </users>

    <quotas>
        <default>
            <interval>
                <duration>3600</duration>
                <queries>0</queries>
                <errors>0</errors>
                <result_rows>0</result_rows>
                <read_rows>0</read_rows>
                <execution_time>0</execution_time>
            </interval>
        </default>
    </quotas>
</clickhouse>
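
users.xml defines a passwordless default user plus api and worker users (password equal to the user name), all admitted from any address (::/0); only the default user is referenced by the compose file below. If one of the named users were used instead, the app's connection JSON would carry the credentials, e.g. (illustrative variation of the DEFAULT_CONNECTIONS value from this commit):

# Illustrative only -- same JSON shape as in the compose template, switched to the 'api' user.
DEFAULT_CONNECTIONS: >-
  [{"name":"Local ClickHouse","host":"http://ch-server:8123","username":"api","password":"api"}]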

View File

@@ -0,0 +1,66 @@
---
- name: HYPERDX | Create directories
  ansible.builtin.file:
    path: "{{ data_dir }}/{{ item.path }}"
    state: directory
    owner: "{{ item.uid }}"
    group: "{{ item.gid }}"
    mode: '0755'
    recurse: no
  loop:
    - { path: 'mongo', uid: 999, gid: 999 }            # MongoDB default
    - { path: 'clickhouse/data', uid: 101, gid: 101 }  # ClickHouse default
    - { path: 'clickhouse/logs', uid: 101, gid: 101 }
    - { path: 'clickhouse/config', uid: 101, gid: 101 }
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"

- name: HYPERDX | Copy ClickHouse configuration
  ansible.builtin.copy:
    src: "{{ item }}"
    dest: "/mnt/cephfs/hyperdx/clickhouse/config/"
    owner: 101
    group: 101
    mode: '0644'
  loop:
    - files/config.xml  # local to the Ansible repo
    - files/users.xml
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"

- name: HYPERDX | Create shared-observability network
  community.docker.docker_network:
    name: shared-observability
    driver: overlay
    state: present
    attachable: yes
    ipam_config:
      - subnet: '172.16.116.0/24'
        gateway: '172.16.116.1'
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"

- name: HYPERDX | Generate OTel Collector config
  ansible.builtin.template:
    src: otel-collector-config.yaml.j2
    dest: "{{ data_dir }}/data/otel-collector-config.yaml"
    mode: '0644'
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"

- name: HYPERDX | Generate Compose file
  ansible.builtin.template:
    src: docker-compose.yml.j2
    dest: '{{ data_dir }}/hyperdx.yml'
    mode: '0644'
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"

- name: HYPERDX | Deploy stack
  community.docker.docker_stack:
    state: present
    name: hyperdx
    compose:
      - '{{ data_dir }}/hyperdx.yml'
  delegate_to: "{{ groups['managers'][0] }}"
  run_once: true
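
The role relies on a few variables that its defaults leave empty or undefined (the defaults file below ships hyperdx_api_key as an empty string). A minimal group_vars sketch with placeholder values might be:

# Hypothetical group_vars sketch -- all values are placeholders.
ceph_volume: "/mnt/cephfs"
main_domain: "example.org"
hyperdx_api_key: "0123456789abcdef0123456789abcdef"  # generate with: openssl rand -hex 16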

View File

@@ -0,0 +1,99 @@
version: '3.9'

services:
  db:
    image: mongo:5.0.14-focal
    volumes:
      - "{{ data_dir }}/mongo:/data/db"
    networks:
      - internal
    deploy:
      mode: replicated
      replicas: 1
      # placement:
      #   constraints: [node.role == worker]  # prefer to keep DBs on workers where possible

  otel-collector:
    image: "clickhouse/clickstack-otel-collector:2"
    environment:
      CLICKHOUSE_ENDPOINT: 'tcp://ch-server:9000?dial_timeout=10s'
      HYPERDX_OTEL_EXPORTER_CLICKHOUSE_DATABASE: "default"
      HYPERDX_LOG_LEVEL: "{{ hyperdx_log_level | default('info') }}"
      OPAMP_SERVER_URL: 'http://app:{{ hyperdx_opamp_port | default(4320) }}'
    ports:
      - "4317:4317"  # OTLP gRPC
      - "4318:4318"  # OTLP HTTP
      # - "8888:8888"  # Metrics (optional)
    networks:
      - internal
      - shared-observability
      - traefik_public
    deploy:
      mode: replicated
      replicas: 3
      labels:
        - "traefik.enable=true"
        - "traefik.docker.network=traefik_public"
        - "traefik.http.routers.otel-collector.rule=Host(`{{ otlp_domain }}`)"
        - "traefik.http.routers.otel-collector.entrypoints=https"
        - "traefik.http.routers.otel-collector.tls.certresolver=main"
        - "traefik.http.services.otel-collector.loadbalancer.server.port=4318"

  app:
    image: "hyperdx/hyperdx:2"
    environment:
      # URLs adjusted so the app is reachable through Traefik
      FRONTEND_URL: "https://{{ hdx_domain }}"
      HYPERDX_APP_URL: "https://{{ hdx_domain }}"
      HYPERDX_API_KEY: "{{ hyperdx_api_key }}"
      HYPERDX_API_PORT: "{{ hyperdx_api_port | default(8000) }}"
      HYPERDX_APP_PORT: "{{ hyperdx_app_port | default(8080) }}"
      HYPERDX_LOG_LEVEL: "{{ hyperdx_log_level | default('info') }}"
      MINER_API_URL: 'http://miner:5123'  # in case the miner is needed (not defined in the original compose?)
      MONGO_URI: 'mongodb://db:27017/hyperdx'
      SERVER_URL: "http://127.0.0.1:{{ hyperdx_api_port | default(8000) }}"
      OPAMP_PORT: "{{ hyperdx_opamp_port | default(4320) }}"
      OTEL_EXPORTER_OTLP_ENDPOINT: 'http://otel-collector:4318'
      OTEL_SERVICE_NAME: 'hdx-oss-app'
      USAGE_STATS_ENABLED: "{{ usage_stats_enabled | default('false') }}"
      # ClickHouse connection string (default user/password from the ClickHouse image)
      DEFAULT_CONNECTIONS: >-
        [{"name":"Local ClickHouse","host":"http://ch-server:8123","username":"default","password":""}]
      DEFAULT_SOURCES: '{{ hyperdx_default_sources | to_json }}'
    networks:
      - internal
      - traefik_public
    deploy:
      labels:
        - "traefik.enable=true"
        - "traefik.docker.network=traefik_public"
        - "traefik.http.routers.hyperdx.rule=Host(`{{ subdomain }}.{{ main_domain }}`)"
        - "traefik.http.routers.hyperdx.entrypoints=https"
        - "traefik.http.routers.hyperdx.tls.certresolver=main"
        - "traefik.http.services.hyperdx.loadbalancer.server.port={{ hyperdx_app_port | default(8080) }}"

  ch-server:
    image: clickhouse/clickhouse-server:25.6-alpine
    environment:
      CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1
    volumes:
      - "{{ data_dir }}/clickhouse/config/config.xml:/etc/clickhouse-server/config.xml"
      - "{{ data_dir }}/clickhouse/config/users.xml:/etc/clickhouse-server/users.xml"
      - "{{ data_dir }}/clickhouse/data:/var/lib/clickhouse"
      - "{{ data_dir }}/clickhouse/logs:/var/log/clickhouse-server"
    deploy:
      mode: replicated
      replicas: 1
      # placement:
      #   constraints: [node.role == worker]
    networks:
      - internal

networks:
  internal:
    driver: overlay
  traefik_public:
    external: true
  shared-observability:
    external: true
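
The gateway collector accepts OTLP on 4317 (gRPC) and 4318 (HTTP) and is additionally published through Traefik under {{ otlp_domain }}. A workload that should ship telemetry into this pipeline could therefore either join the shared-observability network or use the public endpoint, e.g. (a sketch; the service and image names are hypothetical):

# Hypothetical application service sending telemetry to the new gateway -- not part of this commit.
services:
  my-app:
    image: registry.example.org/my-app:latest
    environment:
      OTEL_EXPORTER_OTLP_ENDPOINT: "https://{{ otlp_domain }}"
      OTEL_EXPORTER_OTLP_HEADERS: "authorization={{ hyperdx_api_ingestion_key }}"  # same header the node agent uses
      OTEL_SERVICE_NAME: "my-app"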

View File

@@ -0,0 +1,103 @@
data_dir: "{{ ceph_volume }}/hyperdx"
subdomain: "hdx"
hdx_domain: "{{ subdomain }}.{{ main_domain }}"
otlp_domain: "otlp.{{ main_domain }}"

# Generate a secure key with: `openssl rand -hex 16`
hyperdx_api_key: ""

hyperdx_api_port: 8000
hyperdx_app_port: 8080
hyperdx_log_level: "info"
hyperdx_opamp_port: 4320
usage_stats_enabled: "false"

# Definition of the data sources for the frontend
hyperdx_default_sources:
  - name: "Logs"
    kind: "log"
    from:
      databaseName: "default"
      tableName: "otel_logs"
    timestampValueExpression: "TimestampTime"
    displayedTimestampValueExpression: "Timestamp"
    implicitColumnExpression: "Body"
    serviceNameExpression: "ServiceName"
    bodyExpression: "Body"
    eventAttributesExpression: "LogAttributes"
    resourceAttributesExpression: "ResourceAttributes"
    defaultTableSelectExpression: "Timestamp,ServiceName,SeverityText,Body"
    severityTextExpression: "SeverityText"
    traceIdExpression: "TraceId"
    spanIdExpression: "SpanId"
    connection: "Local ClickHouse"
    traceSourceId: "Traces"
    sessionSourceId: "Sessions"
    metricSourceId: "Metrics"

  - name: "Traces"
    kind: "trace"
    from:
      databaseName: "default"
      tableName: "otel_traces"
    timestampValueExpression: "Timestamp"
    displayedTimestampValueExpression: "Timestamp"
    implicitColumnExpression: "SpanName"
    serviceNameExpression: "ServiceName"
    bodyExpression: "SpanName"
    eventAttributesExpression: "SpanAttributes"
    resourceAttributesExpression: "ResourceAttributes"
    defaultTableSelectExpression: "Timestamp,ServiceName,StatusCode,round(Duration/1e6),SpanName"
    traceIdExpression: "TraceId"
    spanIdExpression: "SpanId"
    durationExpression: "Duration"
    durationPrecision: 9
    parentSpanIdExpression: "ParentSpanId"
    spanNameExpression: "SpanName"
    spanKindExpression: "SpanKind"
    statusCodeExpression: "StatusCode"
    statusMessageExpression: "StatusMessage"
    connection: "Local ClickHouse"
    logSourceId: "Logs"
    sessionSourceId: "Sessions"
    metricSourceId: "Metrics"

  - name: "Metrics"
    kind: "metric"
    from:
      databaseName: "default"
      tableName: ""
    timestampValueExpression: "TimeUnix"
    resourceAttributesExpression: "ResourceAttributes"
    metricTables:
      gauge: "otel_metrics_gauge"
      histogram: "otel_metrics_histogram"
      sum: "otel_metrics_sum"
      _id: "682586a8b1f81924e628e808"
      id: "682586a8b1f81924e628e808"
    connection: "Local ClickHouse"
    logSourceId: "Logs"
    traceSourceId: "Traces"
    sessionSourceId: "Sessions"

  - name: "Sessions"
    kind: "session"
    from:
      databaseName: "default"
      tableName: "hyperdx_sessions"
    timestampValueExpression: "TimestampTime"
    displayedTimestampValueExpression: "Timestamp"
    implicitColumnExpression: "Body"
    serviceNameExpression: "ServiceName"
    bodyExpression: "Body"
    eventAttributesExpression: "LogAttributes"
    resourceAttributesExpression: "ResourceAttributes"
    defaultTableSelectExpression: "Timestamp,ServiceName,SeverityText,Body"
    severityTextExpression: "SeverityText"
    traceIdExpression: "TraceId"
    spanIdExpression: "SpanId"
    connection: "Local ClickHouse"
    logSourceId: "Logs"
    traceSourceId: "Traces"
    metricSourceId: "Metrics"

View File

@@ -0,0 +1,32 @@
- name: MONITORING | Ensure data directories
  file:
    path: "{{ data_dir }}/data"
    state: directory
    mode: '0755'
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"

- name: MONITORING | Generate config
  template:
    src: otel-agent-config.yaml.j2
    dest: "{{ data_dir }}/otel-agent-config.yaml"
  delegate_to: "{{ groups['managers'][0] }}"
  run_once: true

- name: MONITORING | Generate Compose file
  template:
    src: docker-compose.yml.j2
    dest: "{{ data_dir }}/monitoring.yml"
    mode: '0644'
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"

- name: MONITORING | Deploy stack
  community.docker.docker_stack:
    state: present
    name: infra-monitoring
    compose:
      - "{{ data_dir }}/monitoring.yml"
  delegate_to: "{{ groups['managers'][0] }}"
  run_once: true

View File

@@ -0,0 +1,38 @@
version: '3.9'

services:
  otel-agent:
    image: otel/opentelemetry-collector-contrib:0.143.0
    user: "0:0"  # root for hardware access
    command: ["--config=/etc/otel-agent-config.yaml"]
    security_opt:
      - apparmor:unconfined
    volumes:
      - {{ data_dir }}/otel-agent-config.yaml:/etc/otel-agent-config.yaml
      - /:/hostfs:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /sys:/hostfs/sys:ro
      - /proc:/hostfs/proc:ro
    environment:
      - GOMEMLIMIT=180MiB
      - HOST_PROC=/hostfs/proc
      - HOST_SYS=/hostfs/sys
      - HOST_ETC=/hostfs/etc
      - HOST_VAR=/hostfs/var
      - HOST_RUN=/hostfs/run
      - HOST_DEV=/hostfs/dev
    deploy:
      mode: global
      update_config:
        parallelism: 1
        delay: 10s
      resources:
        limits:
          memory: 200M
    networks:
      - host

networks:
  host:
    name: host
    external: true

View File

@@ -0,0 +1,117 @@
extensions:
  # Watches Docker containers
  docker_observer:
    endpoint: "unix:///var/run/docker.sock"
    cache_sync_interval: 30s

receivers:
  hostmetrics:
    root_path: /hostfs
    collection_interval: 15s
    scrapers:
      cpu:
        metrics:
          system.cpu.time:
            enabled: true
          system.cpu.utilization:
            enabled: true
      memory:
        metrics:
          system.memory.usage:
            enabled: true
          system.memory.utilization:
            enabled: true
      filesystem:
        metrics:
          system.filesystem.usage:
            enabled: true
          system.filesystem.utilization:
            enabled: true
      paging:
        metrics:
          system.paging.usage:
            enabled: true
          system.paging.utilization:
            enabled: true
          system.paging.faults:
            enabled: true
      load:
      disk:
      network:
  docker_stats:
    endpoint: unix:///var/run/docker.sock
    collection_interval: 30s
    timeout: 20s
  # receiver_creator:
  #   watch_observers: [docker_observer]
  #   receivers:
  #     filelog:
  #       rule: type == "container"  # containers only
  #       config:
  #         include:
  #           - /hostfs/var/lib/docker/containers/*/*.log
  #         operators:
  #           - type: container
  #             format: docker
  #             add_metadata_from_filepath: true
  #           - type: json_parser
  #             timestamp:
  #               parse_from: time
  #               layout: '%Y-%m-%dT%H:%M:%S.%LZ'
  #             severity:
  #               parse_from: stream
  #               mapping:
  #                 info: stdout
  #                 error: stderr
  # Ceph scraping (only works on nodes where the Ceph mgr is running)
  prometheus:
    config:
      scrape_configs:
        - job_name: 'ceph-local'
          scrape_interval: 30s
          scrape_timeout: 10s
          static_configs:
            - targets: ['127.0.0.1:9283']
          metric_relabel_configs:
            - source_labels: [__name__]
              regex: 'ceph_cluster_total_.*|ceph_health_status|ceph_osd_.*|ceph_pool_.*'
              action: keep

processors:
  batch:
    timeout: 5s
  resourcedetection:
    detectors: [env, system]
  resourcedetection/docker:
    detectors: [env, docker]
    timeout: 2s
    override: false

exporters:
  debug:
    verbosity: detailed
  otlp:
    endpoint: "127.0.0.1:4317"
    headers:
      authorization: {{ hyperdx_api_ingestion_key }}
    compression: gzip
    tls:
      insecure: true

service:
  extensions: [docker_observer]
  pipelines:
    metrics:
      receivers: [hostmetrics, docker_stats, prometheus]
      # receivers: [hostmetrics]
      processors: [resourcedetection, batch]
      exporters: [otlp]
    # logs:
    #   receivers: [receiver_creator]
    #   processors: [resourcedetection/docker, batch]
    #   exporters: [otlp, debug]
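
The otlp exporter sends to 127.0.0.1:4317; since the agent runs on the host network, that resolves to the node-local published OTLP port of the hyperdx gateway collector. The authorization header uses hyperdx_api_ingestion_key, which is not defined in the role defaults below; presumably it is the ingestion API key from the HyperDX UI and has to be supplied per environment, e.g. (placeholder value):

# Hypothetical group_vars entry -- the value is a placeholder for the HyperDX ingestion API key.
hyperdx_api_ingestion_key: "00000000-0000-0000-0000-000000000000"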

View File

@@ -0,0 +1 @@
data_dir: "{{ ceph_volume }}/infra-monitoring"

View File

@@ -1,13 +0,0 @@
---
- name: Copy Stack Files
  copy:
    directory_mode: true
    src: /Users/d3r0/dev/repositories/active/gc/iac/ansible/resources/monitoring
    dest: /srv
- block:
    - name: Deploy Monitoring stack
      community.docker.docker_stack:
        state: present
        name: monitoring
        compose:
          - /srv/monitoring/observability.yml

View File

@@ -59,6 +59,14 @@
    interface: "{{ private_interface }}"
    direction: in

- name: FIREWALL | Allow the Ceph Prometheus exporter on the private interface
  community.general.ufw:
    rule: allow
    port: "9283"
    proto: tcp
    interface: "{{ private_interface }}"
    direction: in

- name: FIREWALL | Allow Docker Swarm management ports on the private interface
  community.general.ufw:
    rule: allow