add infrastructure monitoring
Add ClickStack (HyperDX) as the aggregation platform. Configure the OTel Collector to collect host metrics.
This commit is contained in:
parent
4eeaf483bc
commit
0496728700
|
|
@ -90,4 +90,19 @@
|
||||||
src: "{{ hostvars[groups['managers'][0]]['ceph_bootstrap_ip'] }}:/"
|
src: "{{ hostvars[groups['managers'][0]]['ceph_bootstrap_ip'] }}:/"
|
||||||
fstype: ceph
|
fstype: ceph
|
||||||
opts: "name=admin,secret={{ ceph_admin_key.stdout }}"
|
opts: "name=admin,secret={{ ceph_admin_key.stdout }}"
|
||||||
state: mounted
|
state: mounted
|
||||||
|
|
||||||
|
# Enable metrics: turn on the Ceph mgr "prometheus" module unless it is
# already listed as enabled.
- name: CEPH | Prüfen, ob Prometheus Modul bereits aktiv ist
  ansible.builtin.command:
    cmd: ceph mgr module ls --format json
  register: ceph_modules_status
  # Read-only query: never reports a change. It must also run under --check,
  # otherwise the registered result has no stdout and the condition of the
  # next task cannot be evaluated.
  changed_when: false
  check_mode: false
  delegate_to: "{{ groups['managers'][0] }}"
  run_once: true

- name: CEPH | Prometheus Modul aktivieren
  ansible.builtin.command:
    cmd: ceph mgr module enable prometheus
  # Run only when 'prometheus' is missing from the 'enabled_modules' list of
  # the JSON output registered above.
  when: "'prometheus' not in (ceph_modules_status.stdout | from_json).enabled_modules"
  delegate_to: "{{ groups['managers'][0] }}"
  run_once: true
|
||||||
|
|
@ -0,0 +1,175 @@
|
||||||
|
<?xml version="1.0"?>
<!-- ClickHouse server configuration: logging, listeners, limits, storage paths,
     Prometheus endpoint, system log tables and a one-shard/one-replica cluster. -->
<clickhouse>
    <!-- Debug-level logging to the console; the file targets (log, errorlog)
         are marked with remove="remove". -->
    <logger>
        <level>debug</level>
        <console>true</console>
        <log remove="remove" />
        <errorlog remove="remove" />
    </logger>

    <!-- Listeners: HTTP 8123, native TCP 9000, interserver HTTP 9009. -->
    <listen_host>0.0.0.0</listen_host>
    <http_port>8123</http_port>
    <tcp_port>9000</tcp_port>
    <interserver_http_host>ch-server</interserver_http_host>
    <interserver_http_port>9009</interserver_http_port>

    <!-- Connection/query limits and cache sizes (cache sizes are in bytes). -->
    <max_connections>4096</max_connections>
    <keep_alive_timeout>64</keep_alive_timeout>
    <max_concurrent_queries>100</max_concurrent_queries>
    <uncompressed_cache_size>8589934592</uncompressed_cache_size>
    <mark_cache_size>5368709120</mark_cache_size>

    <!-- Storage locations, all below /var/lib/clickhouse/. -->
    <path>/var/lib/clickhouse/</path>
    <tmp_path>/var/lib/clickhouse/tmp/</tmp_path>
    <user_files_path>/var/lib/clickhouse/user_files/</user_files_path>

    <!-- Users, profiles and quotas are read from users.xml. -->
    <user_directories>
        <users_xml>
            <path>users.xml</path>
        </users_xml>
    </user_directories>
    <!-- <users_config>users.xml</users_config> -->
    <default_profile>default</default_profile>
    <default_database>default</default_database>
    <timezone>UTC</timezone>
    <mlock_executable>false</mlock_executable>

    <!-- Prometheus exporter -->
    <prometheus>
        <endpoint>/metrics</endpoint>
        <port>9363</port>
        <metrics>true</metrics>
        <events>true</events>
        <asynchronous_metrics>true</asynchronous_metrics>
        <errors>true</errors>
    </prometheus>

    <!-- Query log. Used only for queries with setting log_queries = 1. -->
    <query_log>
        <database>system</database>
        <table>query_log</table>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </query_log>

    <!-- Metric log contains rows with current values of ProfileEvents, CurrentMetrics collected
         with "collect_interval_milliseconds" interval. -->
    <metric_log>
        <database>system</database>
        <table>metric_log</table>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
        <collect_interval_milliseconds>1000</collect_interval_milliseconds>
    </metric_log>

    <!--
        Asynchronous metric log contains values of metrics from
        system.asynchronous_metrics.
    -->
    <asynchronous_metric_log>
        <database>system</database>
        <table>asynchronous_metric_log</table>
        <!--
            Asynchronous metrics are updated once a minute, so there is
            no need to flush more often.
        -->
        <flush_interval_milliseconds>7000</flush_interval_milliseconds>
    </asynchronous_metric_log>

    <!--
        OpenTelemetry log contains OpenTelemetry trace spans.
    -->
    <opentelemetry_span_log>
        <!--
            The default table creation code is insufficient, this <engine> spec
            is a workaround. There is no 'event_time' for this log, but two times,
            start and finish. It is sorted by finish time, to avoid inserting
            data too far away in the past (probably we can sometimes insert a span
            that is seconds earlier than the last span in the table, due to a race
            between several spans inserted in parallel). This gives the spans a
            global order that we can use to e.g. retry insertion into some external
            system.
        -->
        <engine>
            engine MergeTree
            partition by toYYYYMM(finish_date)
            order by (finish_date, finish_time_us, trace_id)
        </engine>
        <database>system</database>
        <table>opentelemetry_span_log</table>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </opentelemetry_span_log>


    <!-- Crash log. Stores stack traces for fatal errors.
         This table is normally empty. -->
    <crash_log>
        <database>system</database>
        <table>crash_log</table>

        <partition_by />
        <flush_interval_milliseconds>1000</flush_interval_milliseconds>
    </crash_log>

    <!-- Profiling on Processors level. -->
    <processors_profile_log>
        <database>system</database>
        <table>processors_profile_log</table>

        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </processors_profile_log>

    <!-- Part log (enabled here).
         Part log contains information about all actions with parts in MergeTree tables (creation, deletion,
         merges, downloads).-->
    <part_log>
        <database>system</database>
        <table>part_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </part_log>

    <!-- Trace log. Stores stack traces collected by query profilers.
         See query_profiler_real_time_period_ns and query_profiler_cpu_time_period_ns settings. -->
    <trace_log>
        <database>system</database>
        <table>trace_log</table>

        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </trace_log>

    <!-- Query thread log. Has information about all threads participated in query execution.
         Used only for queries with setting log_query_threads = 1. -->
    <query_thread_log>
        <database>system</database>
        <table>query_thread_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </query_thread_log>

    <!-- Query views log. Has information about all dependent views associated with a query.
         Used only for queries with setting log_query_views = 1. -->
    <query_views_log>
        <database>system</database>
        <table>query_views_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </query_views_log>

    <!-- Cluster "hdx_cluster": one shard with one replica at ch-server:9000. -->
    <remote_servers>
        <hdx_cluster>
            <shard>
                <replica>
                    <host>ch-server</host>
                    <port>9000</port>
                </replica>
            </shard>
        </hdx_cluster>
    </remote_servers>

    <!-- NOTE(review): no zookeeper/keeper section is visible in this file;
         confirm whether the distributed DDL queue is actually usable. -->
    <distributed_ddl>
        <path>/clickhouse/task_queue/ddl</path>
    </distributed_ddl>

    <format_schema_path>/var/lib/clickhouse/format_schemas/</format_schema_path>
</clickhouse>
|
||||||
|
|
@ -0,0 +1,51 @@
|
||||||
|
<?xml version="1.0"?>
<!-- ClickHouse users configuration: one settings profile, three users
     (default, api, worker) and one quota shared by all of them. -->
<clickhouse>
    <!-- Settings profile referenced by every user below. -->
    <profiles>
        <default>
            <max_memory_usage>10000000000</max_memory_usage>
            <use_uncompressed_cache>0</use_uncompressed_cache>
            <load_balancing>in_order</load_balancing>
            <log_queries>1</log_queries>
        </default>
    </profiles>

    <!-- NOTE(review): "default" has an empty password, "api" and "worker" use
         their own user name as password, and all three accept connections from
         any address (::/0). Confirm the server is reachable only from trusted
         networks, or replace these credentials. -->
    <users>
        <default>
            <password></password>
            <profile>default</profile>
            <networks>
                <ip>::/0</ip>
            </networks>
            <quota>default</quota>
        </default>
        <api>
            <password>api</password>
            <profile>default</profile>
            <networks>
                <ip>::/0</ip>
            </networks>
            <quota>default</quota>
        </api>
        <worker>
            <password>worker</password>
            <profile>default</profile>
            <networks>
                <ip>::/0</ip>
            </networks>
            <quota>default</quota>
        </worker>
    </users>

    <!-- Quota "default": a 3600-second interval with every counter set to 0. -->
    <quotas>
        <default>
            <interval>
                <duration>3600</duration>
                <queries>0</queries>
                <errors>0</errors>
                <result_rows>0</result_rows>
                <read_rows>0</read_rows>
                <execution_time>0</execution_time>
            </interval>
        </default>
    </quotas>
</clickhouse>
|
||||||
|
|
@ -0,0 +1,66 @@
|
||||||
|
---
# Create the persistent directories of the HyperDX stack below data_dir, each
# owned by the numeric uid/gid listed with it.
- name: HYPERDX | Verzeichnisse erstellen
  ansible.builtin.file:
    path: "{{ data_dir }}/{{ item.path }}"
    state: directory
    owner: "{{ item.uid }}"
    group: "{{ item.gid }}"
    mode: '0755'
    # Only the directory itself is adjusted, not its existing contents.
    recurse: false
  loop:
    # MongoDB default
    - path: 'mongo'
      uid: 999
      gid: 999
    # ClickHouse default
    - path: 'clickhouse/data'
      uid: 101
      gid: 101
    - path: 'clickhouse/logs'
      uid: 101
      gid: 101
    - path: 'clickhouse/config'
      uid: 101
      gid: 101
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"
|
||||||
|
|
||||||
|
# Copy the ClickHouse server and user configuration into the config directory
# created above. The destination is derived from data_dir so that it matches
# the directory task and the paths mounted by the compose template.
- name: HYPERDX | ClickHouse Konfiguration kopieren
  ansible.builtin.copy:
    src: "{{ item }}"
    dest: "{{ data_dir }}/clickhouse/config/"
    owner: 101
    group: 101
    mode: '0644'
  loop:
    # Local files in the Ansible repository
    - files/config.xml
    - files/users.xml
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"
|
||||||
|
|
||||||
|
# Create the attachable overlay network "shared-observability" with a fixed
# subnet and gateway.
- name: HYPERDX | shared-observability Netzwerk erstellen
  community.docker.docker_network:
    name: shared-observability
    driver: overlay
    state: present
    attachable: true
    ipam_config:
      - subnet: '172.16.116.0/24'
        gateway: '172.16.116.1'
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"
|
||||||
|
|
||||||
|
# The template module does not create missing parent directories, and "data"
# is not among the directories created at the top of this file, so ensure it
# exists before rendering into it.
- name: HYPERDX | OTel Collector Config Verzeichnis erstellen
  ansible.builtin.file:
    path: "{{ data_dir }}/data"
    state: directory
    mode: '0755'
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"

# Render the collector configuration on the first manager.
# NOTE(review): no service in this role's compose template appears to mount
# this file - confirm where it is consumed.
- name: HYPERDX | OTel Collector Config generieren
  ansible.builtin.template:
    src: otel-collector-config.yaml.j2
    dest: "{{ data_dir }}/data/otel-collector-config.yaml"
    mode: '0644'
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"
|
||||||
|
|
||||||
|
# Render the stack definition that the deploy task reads.
- name: HYPERDX | Generate Compose file
  ansible.builtin.template:
    src: docker-compose.yml.j2
    dest: '{{ data_dir }}/hyperdx.yml'
    # Quoted so the mode is passed as a string instead of relying on YAML
    # octal-integer parsing; matches the other tasks in this file.
    mode: '0644'
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"
|
||||||
|
|
||||||
|
# Deploy (or update) the "hyperdx" stack from the rendered compose file,
# once, on the first manager.
- name: HYPERDX | Deploy stack
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"
  community.docker.docker_stack:
    name: hyperdx
    state: present
    compose: ['{{ data_dir }}/hyperdx.yml']
|
||||||
|
|
@ -0,0 +1,99 @@
|
||||||
|
version: '3.9'

services:
  # MongoDB; the app service connects to it through MONGO_URI.
  db:
    image: mongo:5.0.14-focal
    volumes:
      - "{{ data_dir }}/mongo:/data/db"
    networks:
      - internal
    deploy:
      mode: replicated
      replicas: 1
      # placement:
      #   constraints: [node.role == worker]  # keep databases on workers where possible

  # OTLP ingestion endpoint: published on 4317/4318 and routed by Traefik to
  # the HTTP port (4318).
  otel-collector:
    image: "clickhouse/clickstack-otel-collector:2"
    environment:
      CLICKHOUSE_ENDPOINT: 'tcp://ch-server:9000?dial_timeout=10s'
      HYPERDX_OTEL_EXPORTER_CLICKHOUSE_DATABASE: "default"
      HYPERDX_LOG_LEVEL: "{{ hyperdx_log_level | default('info') }}"
      OPAMP_SERVER_URL: 'http://app:{{ hyperdx_opamp_port | default(4320) }}'
    ports:
      - "4317:4317"  # OTLP gRPC
      - "4318:4318"  # OTLP HTTP
      # - "8888:8888"  # Metrics (optional)
    networks:
      - internal
      - shared-observability
      - traefik_public
    deploy:
      mode: replicated
      replicas: 3
      labels:
        - "traefik.enable=true"
        - "traefik.docker.network=traefik_public"
        - "traefik.http.routers.otel-collector.rule=Host(`{{ otlp_domain }}`)"
        - "traefik.http.routers.otel-collector.entrypoints=https"
        - "traefik.http.routers.otel-collector.tls.certresolver=main"
        - "traefik.http.services.otel-collector.loadbalancer.server.port=4318"

  # HyperDX application (UI and API).
  app:
    image: "hyperdx/hyperdx:2"
    environment:
      # Adjust the URLs so the app is reachable through Traefik
      FRONTEND_URL: "https://{{ hdx_domain }}"
      HYPERDX_APP_URL: "https://{{ hdx_domain }}"

      HYPERDX_API_KEY: "{{ hyperdx_api_key }}"
      HYPERDX_API_PORT: "{{ hyperdx_api_port | default(8000) }}"
      HYPERDX_APP_PORT: "{{ hyperdx_app_port | default(8080) }}"
      HYPERDX_LOG_LEVEL: "{{ hyperdx_log_level | default('info') }}"
      # Only relevant if a miner service is needed (not defined in the original compose?)
      MINER_API_URL: 'http://miner:5123'
      MONGO_URI: 'mongodb://db:27017/hyperdx'
      SERVER_URL: "http://127.0.0.1:{{ hyperdx_api_port | default(8000) }}"
      OPAMP_PORT: "{{ hyperdx_opamp_port | default(4320) }}"
      OTEL_EXPORTER_OTLP_ENDPOINT: 'http://otel-collector:4318'
      OTEL_SERVICE_NAME: 'hdx-oss-app'
      USAGE_STATS_ENABLED: "{{ usage_stats_enabled | default('false') }}"
      # Clickhouse Connection String (Default User/Pass from Clickhouse Image)
      DEFAULT_CONNECTIONS: >-
        [{"name":"Local ClickHouse","host":"http://ch-server:8123","username":"default","password":""}]
      DEFAULT_SOURCES: '{{ hyperdx_default_sources | to_json }}'
    networks:
      - internal
      - traefik_public
    deploy:
      labels:
        - "traefik.enable=true"
        - "traefik.docker.network=traefik_public"
        # Same host variable as FRONTEND_URL / HYPERDX_APP_URL above, so the
        # router rule and the advertised URLs cannot drift apart.
        - "traefik.http.routers.hyperdx.rule=Host(`{{ hdx_domain }}`)"
        - "traefik.http.routers.hyperdx.entrypoints=https"
        - "traefik.http.routers.hyperdx.tls.certresolver=main"
        - "traefik.http.services.hyperdx.loadbalancer.server.port={{ hyperdx_app_port | default(8080) }}"

  # ClickHouse server; configuration, data and logs are bind-mounted from data_dir.
  ch-server:
    image: clickhouse/clickhouse-server:25.6-alpine
    environment:
      # Quoted: environment values are strings.
      CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: "1"
    volumes:
      - "{{ data_dir }}/clickhouse/config/config.xml:/etc/clickhouse-server/config.xml"
      - "{{ data_dir }}/clickhouse/config/users.xml:/etc/clickhouse-server/users.xml"
      - "{{ data_dir }}/clickhouse/data:/var/lib/clickhouse"
      - "{{ data_dir }}/clickhouse/logs:/var/log/clickhouse-server"
    deploy:
      mode: replicated
      replicas: 1
      # placement:
      #   constraints: [node.role == worker]
    networks:
      - internal

networks:
  internal:
    driver: overlay
  traefik_public:
    external: true
  shared-observability:
    external: true
|
||||||
|
|
@ -0,0 +1,103 @@
|
||||||
|
# Variables of the HyperDX role: storage location, host names, application
# settings and the default data sources passed to the frontend.
data_dir: "{{ ceph_volume }}/hyperdx"
subdomain: 'hdx'

hdx_domain: "{{ subdomain }}.{{ main_domain }}"
otlp_domain: "otlp.{{ main_domain }}"

# Generate a secure key: `openssl rand -hex 16`
hyperdx_api_key: ''
hyperdx_api_port: 8000
hyperdx_app_port: 8080
hyperdx_log_level: 'info'
hyperdx_opamp_port: 4320

usage_stats_enabled: 'false'

# Definition of the data sources for the frontend
hyperdx_default_sources:
  - name: 'Logs'
    kind: 'log'
    from:
      databaseName: 'default'
      tableName: 'otel_logs'
    timestampValueExpression: 'TimestampTime'
    displayedTimestampValueExpression: 'Timestamp'
    implicitColumnExpression: 'Body'
    serviceNameExpression: 'ServiceName'
    bodyExpression: 'Body'
    eventAttributesExpression: 'LogAttributes'
    resourceAttributesExpression: 'ResourceAttributes'
    defaultTableSelectExpression: 'Timestamp,ServiceName,SeverityText,Body'
    severityTextExpression: 'SeverityText'
    traceIdExpression: 'TraceId'
    spanIdExpression: 'SpanId'
    connection: 'Local ClickHouse'
    traceSourceId: 'Traces'
    sessionSourceId: 'Sessions'
    metricSourceId: 'Metrics'

  - name: 'Traces'
    kind: 'trace'
    from:
      databaseName: 'default'
      tableName: 'otel_traces'
    timestampValueExpression: 'Timestamp'
    displayedTimestampValueExpression: 'Timestamp'
    implicitColumnExpression: 'SpanName'
    serviceNameExpression: 'ServiceName'
    bodyExpression: 'SpanName'
    eventAttributesExpression: 'SpanAttributes'
    resourceAttributesExpression: 'ResourceAttributes'
    defaultTableSelectExpression: 'Timestamp,ServiceName,StatusCode,round(Duration/1e6),SpanName'
    traceIdExpression: 'TraceId'
    spanIdExpression: 'SpanId'
    durationExpression: 'Duration'
    durationPrecision: 9
    parentSpanIdExpression: 'ParentSpanId'
    spanNameExpression: 'SpanName'
    spanKindExpression: 'SpanKind'
    statusCodeExpression: 'StatusCode'
    statusMessageExpression: 'StatusMessage'
    connection: 'Local ClickHouse'
    logSourceId: 'Logs'
    sessionSourceId: 'Sessions'
    metricSourceId: 'Metrics'

  # Metrics are split over one table per metric type (see metricTables).
  - name: 'Metrics'
    kind: 'metric'
    from:
      databaseName: 'default'
      tableName: ''
    timestampValueExpression: 'TimeUnix'
    resourceAttributesExpression: 'ResourceAttributes'
    metricTables:
      gauge: 'otel_metrics_gauge'
      histogram: 'otel_metrics_histogram'
      sum: 'otel_metrics_sum'
      _id: '682586a8b1f81924e628e808'
      id: '682586a8b1f81924e628e808'
    connection: 'Local ClickHouse'
    logSourceId: 'Logs'
    traceSourceId: 'Traces'
    sessionSourceId: 'Sessions'

  - name: 'Sessions'
    kind: 'session'
    from:
      databaseName: 'default'
      tableName: 'hyperdx_sessions'
    timestampValueExpression: 'TimestampTime'
    displayedTimestampValueExpression: 'Timestamp'
    implicitColumnExpression: 'Body'
    serviceNameExpression: 'ServiceName'
    bodyExpression: 'Body'
    eventAttributesExpression: 'LogAttributes'
    resourceAttributesExpression: 'ResourceAttributes'
    defaultTableSelectExpression: 'Timestamp,ServiceName,SeverityText,Body'
    severityTextExpression: 'SeverityText'
    traceIdExpression: 'TraceId'
    spanIdExpression: 'SpanId'
    connection: 'Local ClickHouse'
    logSourceId: 'Logs'
    traceSourceId: 'Traces'
    metricSourceId: 'Metrics'
|
||||||
|
|
@ -0,0 +1,32 @@
|
||||||
|
|
||||||
|
# Make sure the role's data directory exists on the first manager.
- name: MONITORING | Ensure data directories
  # Fully qualified module name, consistent with the other task files.
  ansible.builtin.file:
    path: "{{ data_dir }}/data"
    state: directory
    mode: '0755'
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"
|
||||||
|
|
||||||
|
# Render the OTel agent configuration that the agent service mounts.
- name: MONITORING | Config generieren
  ansible.builtin.template:
    src: otel-agent-config.yaml.j2
    dest: "{{ data_dir }}/otel-agent-config.yaml"
    # Explicit mode instead of whatever the remote umask yields.
    # NOTE(review): the rendered file contains the ingestion key - consider a
    # tighter mode if the agent container can still read it.
    mode: '0644'
  delegate_to: "{{ groups['managers'][0] }}"
  run_once: true
|
||||||
|
|
||||||
|
# Render the stack definition that the deploy task reads.
- name: MONITORING | Compose generieren
  ansible.builtin.template:
    src: docker-compose.yml.j2
    dest: "{{ data_dir }}/monitoring.yml"
    # Quoted so the mode is passed as a string instead of relying on YAML
    # octal-integer parsing.
    mode: '0644'
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"
|
||||||
|
|
||||||
|
# Deploy (or update) the "infra-monitoring" stack from the rendered compose
# file, once, on the first manager.
- name: MONITORING | Stack deployen
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"
  community.docker.docker_stack:
    name: infra-monitoring
    state: present
    compose: ["{{ data_dir }}/monitoring.yml"]
|
||||||
|
|
@ -0,0 +1,38 @@
|
||||||
|
version: '3.9'

services:
  # One collector agent per swarm node (global mode), attached to the host
  # network and reading the host through read-only mounts.
  otel-agent:
    image: otel/opentelemetry-collector-contrib:0.143.0
    user: "0:0"  # root for hardware access
    command: ["--config=/etc/otel-agent-config.yaml"]
    # NOTE(review): the Compose v3 reference lists security_opt among the
    # options ignored by `docker stack deploy` - confirm this has any effect.
    security_opt:
      - apparmor:unconfined
    volumes:
      # Quoted so the rendered path is always parsed as a single string.
      - "{{ data_dir }}/otel-agent-config.yaml:/etc/otel-agent-config.yaml"
      - /:/hostfs:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /sys:/hostfs/sys:ro
      - /proc:/hostfs/proc:ro
    environment:
      # Go soft memory limit, kept below the 200M container limit set below.
      - GOMEMLIMIT=180MiB
      # Host paths as seen through the /hostfs mount.
      - HOST_PROC=/hostfs/proc
      - HOST_SYS=/hostfs/sys
      - HOST_ETC=/hostfs/etc
      - HOST_VAR=/hostfs/var
      - HOST_RUN=/hostfs/run
      - HOST_DEV=/hostfs/dev
    deploy:
      mode: global
      update_config:
        parallelism: 1
        delay: 10s
      resources:
        limits:
          memory: 200M
    networks:
      - host

networks:
  host:
    name: host
    external: true
|
||||||
|
|
@ -0,0 +1,117 @@
|
||||||
|
extensions:
  # Watches Docker containers
  docker_observer:
    endpoint: "unix:///var/run/docker.sock"
    cache_sync_interval: 30s

receivers:
  # Host metrics, read from the host filesystem mounted at /hostfs.
  hostmetrics:
    root_path: /hostfs
    collection_interval: 15s
    scrapers:
      cpu:
        metrics:
          system.cpu.time:
            enabled: true
          system.cpu.utilization:
            enabled: true
      memory:
        metrics:
          system.memory.usage:
            enabled: true
          system.memory.utilization:
            enabled: true
      filesystem:
        metrics:
          system.filesystem.usage:
            enabled: true
          system.filesystem.utilization:
            enabled: true
      paging:
        metrics:
          system.paging.usage:
            enabled: true
          system.paging.utilization:
            enabled: true
          system.paging.faults:
            enabled: true
      # Scrapers without any explicit settings.
      load:
      disk:
      network:

  docker_stats:
    endpoint: unix:///var/run/docker.sock
    collection_interval: 30s
    timeout: 20s

  # receiver_creator:
  #   watch_observers: [docker_observer]
  #   receivers:
  #     filelog:
  #       rule: type == "container"  # containers only
  #       config:
  #         include:
  #           - /hostfs/var/lib/docker/containers/*/*.log
  #         operators:
  #           - type: container
  #             format: docker
  #             add_metadata_from_filepath: true
  #           - type: json_parser
  #             timestamp:
  #               parse_from: time
  #               layout: '%Y-%m-%dT%H:%M:%S.%LZ'
  #             severity:
  #               parse_from: stream
  #               mapping:
  #                 info: stdout
  #                 error: stderr

  # Ceph scraping (only works on nodes where the Ceph mgr is running)
  prometheus:
    config:
      scrape_configs:
        - job_name: 'ceph-local'
          scrape_interval: 30s
          scrape_timeout: 10s
          static_configs:
            - targets: ['127.0.0.1:9283']
          # Keep only the listed Ceph metric families.
          metric_relabel_configs:
            - source_labels: [__name__]
              regex: 'ceph_cluster_total_.*|ceph_health_status|ceph_osd_.*|ceph_pool_.*'
              action: keep

processors:
  batch:
    timeout: 5s

  resourcedetection:
    detectors: [env, system]

  # Only referenced by the commented-out logs pipeline below.
  resourcedetection/docker:
    detectors: [env, docker]
    timeout: 2s
    override: false

exporters:
  # Only referenced by the commented-out logs pipeline below.
  debug:
    verbosity: detailed
  otlp:
    endpoint: "127.0.0.1:4317"
    headers:
      # Quoted: an empty key or one with YAML-special characters would
      # otherwise render as null or as invalid YAML.
      authorization: "{{ hyperdx_api_ingestion_key }}"
    compression: gzip
    tls:
      insecure: true

service:
  extensions: [docker_observer]
  pipelines:
    metrics:
      receivers: [hostmetrics, docker_stats, prometheus]
      # receivers: [hostmetrics]
      processors: [resourcedetection, batch]
      exporters: [otlp]
    # logs:
    #   receivers: [receiver_creator]
    #   processors: [resourcedetection/docker, batch]
    #   exporters: [otlp, debug]
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
# Base directory for this role's rendered files, located below ceph_volume.
data_dir: "{{ ceph_volume }}/infra-monitoring"
|
||||||
|
|
@ -1,13 +0,0 @@
|
||||||
---
|
|
||||||
- name: Copy Stack Files
|
|
||||||
copy:
|
|
||||||
directory_mode: true
|
|
||||||
src: /Users/d3r0/dev/repositories/active/gc/iac/ansible/resources/monitoring
|
|
||||||
dest: /srv
|
|
||||||
- block:
|
|
||||||
- name: Deploy Monitoring stack
|
|
||||||
community.docker.docker_stack:
|
|
||||||
state: present
|
|
||||||
name: monitoring
|
|
||||||
compose:
|
|
||||||
- /srv/monitoring/observability.yml
|
|
||||||
|
|
@ -59,6 +59,14 @@
|
||||||
interface: "{{ private_interface }}"
|
interface: "{{ private_interface }}"
|
||||||
direction: in
|
direction: in
|
||||||
|
|
||||||
|
# Allow inbound TCP traffic to port 9283 on the private interface only.
- name: FIREWALL | Ceph Prometheus Exporter auf privatem Interface erlauben
  community.general.ufw:
    rule: allow
    direction: in
    interface: "{{ private_interface }}"
    proto: tcp
    port: "9283"
|
||||||
|
|
||||||
- name: FIREWALL | Docker Swarm Management Ports auf privatem Interface erlauben
|
- name: FIREWALL | Docker Swarm Management Ports auf privatem Interface erlauben
|
||||||
community.general.ufw:
|
community.general.ufw:
|
||||||
rule: allow
|
rule: allow
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue