add infrastructure monitoring
Add ClickStack (HyperDX) as the aggregation platform and configure an OTel Collector agent to collect host metrics.
commit 0496728700 (parent 4eeaf483bc)
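For context: once the stack below is deployed, any Swarm service can ship telemetry by pointing its OTLP exporter at the collector this commit publishes. A minimal sketch (the `myapp` service and its image are placeholders, not part of this commit; the env vars are the standard OTel SDK ones):

  myapp:
    image: registry.example.org/myapp:latest   # placeholder image
    environment:
      OTEL_EXPORTER_OTLP_ENDPOINT: "http://otel-collector:4318"  # OTLP HTTP port published below
      OTEL_SERVICE_NAME: "myapp"
    networks:
      - shared-observability   # the attachable overlay network created in this commit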
@@ -91,3 +91,18 @@
    fstype: ceph
    opts: "name=admin,secret={{ ceph_admin_key.stdout }}"
    state: mounted

# Enable metrics
- name: CEPH | Check whether the Prometheus module is already active
  ansible.builtin.command: "ceph mgr module ls --format json"
  register: ceph_modules_status
  changed_when: false
  delegate_to: "{{ groups['managers'][0] }}"
  run_once: true

- name: CEPH | Enable the Prometheus module
  ansible.builtin.command: "ceph mgr module enable prometheus"
  # We check the JSON output for 'prometheus' missing from the 'enabled_modules' list
  when: "'prometheus' not in (ceph_modules_status.stdout | from_json).enabled_modules"
  delegate_to: "{{ groups['managers'][0] }}"
  run_once: true
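A one-off check that the exporter actually answers could look like this (a sketch, not part of the commit; 9283 is the Ceph mgr Prometheus default and is opened in the firewall hunk further down):

- name: CEPH | Verify Prometheus exporter responds
  ansible.builtin.uri:
    url: "http://127.0.0.1:9283/metrics"
    status_code: 200
  changed_when: false
  delegate_to: "{{ groups['managers'][0] }}"
  run_once: true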
@@ -0,0 +1,175 @@
<?xml version="1.0"?>
<clickhouse>
    <logger>
        <level>debug</level>
        <console>true</console>
        <log remove="remove" />
        <errorlog remove="remove" />
    </logger>

    <listen_host>0.0.0.0</listen_host>
    <http_port>8123</http_port>
    <tcp_port>9000</tcp_port>
    <interserver_http_host>ch-server</interserver_http_host>
    <interserver_http_port>9009</interserver_http_port>

    <max_connections>4096</max_connections>
    <keep_alive_timeout>64</keep_alive_timeout>
    <max_concurrent_queries>100</max_concurrent_queries>
    <uncompressed_cache_size>8589934592</uncompressed_cache_size>
    <mark_cache_size>5368709120</mark_cache_size>

    <path>/var/lib/clickhouse/</path>
    <tmp_path>/var/lib/clickhouse/tmp/</tmp_path>
    <user_files_path>/var/lib/clickhouse/user_files/</user_files_path>

    <user_directories>
        <users_xml>
            <path>users.xml</path>
        </users_xml>
    </user_directories>
    <!-- <users_config>users.xml</users_config> -->
    <default_profile>default</default_profile>
    <default_database>default</default_database>
    <timezone>UTC</timezone>
    <mlock_executable>false</mlock_executable>

    <!-- Prometheus exporter -->
    <prometheus>
        <endpoint>/metrics</endpoint>
        <port>9363</port>
        <metrics>true</metrics>
        <events>true</events>
        <asynchronous_metrics>true</asynchronous_metrics>
        <errors>true</errors>
    </prometheus>

    <!-- Query log. Used only for queries with setting log_queries = 1. -->
    <query_log>
        <database>system</database>
        <table>query_log</table>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </query_log>

    <!-- Metric log contains rows with current values of ProfileEvents, CurrentMetrics collected
         with "collect_interval_milliseconds" interval. -->
    <metric_log>
        <database>system</database>
        <table>metric_log</table>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
        <collect_interval_milliseconds>1000</collect_interval_milliseconds>
    </metric_log>

    <!--
        Asynchronous metric log contains values of metrics from
        system.asynchronous_metrics.
    -->
    <asynchronous_metric_log>
        <database>system</database>
        <table>asynchronous_metric_log</table>
        <!--
            Asynchronous metrics are updated once a minute, so there is
            no need to flush more often.
        -->
        <flush_interval_milliseconds>7000</flush_interval_milliseconds>
    </asynchronous_metric_log>

    <!--
        OpenTelemetry log contains OpenTelemetry trace spans.
    -->
    <opentelemetry_span_log>
        <!--
            The default table creation code is insufficient, this <engine> spec
            is a workaround. There is no 'event_time' for this log, but two times,
            start and finish. It is sorted by finish time, to avoid inserting
            data too far away in the past (probably we can sometimes insert a span
            that is seconds earlier than the last span in the table, due to a race
            between several spans inserted in parallel). This gives the spans a
            global order that we can use to e.g. retry insertion into some external
            system.
        -->
        <engine>
            engine MergeTree
            partition by toYYYYMM(finish_date)
            order by (finish_date, finish_time_us, trace_id)
        </engine>
        <database>system</database>
        <table>opentelemetry_span_log</table>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </opentelemetry_span_log>

    <!-- Crash log. Stores stack traces for fatal errors.
         This table is normally empty. -->
    <crash_log>
        <database>system</database>
        <table>crash_log</table>
        <partition_by />
        <flush_interval_milliseconds>1000</flush_interval_milliseconds>
    </crash_log>

    <!-- Profiling on Processors level. -->
    <processors_profile_log>
        <database>system</database>
        <table>processors_profile_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </processors_profile_log>

    <!-- Part log contains information about all actions with parts in MergeTree tables
         (creation, deletion, merges, downloads). -->
    <part_log>
        <database>system</database>
        <table>part_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </part_log>

    <!-- Trace log. Stores stack traces collected by query profilers.
         See query_profiler_real_time_period_ns and query_profiler_cpu_time_period_ns settings. -->
    <trace_log>
        <database>system</database>
        <table>trace_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </trace_log>

    <!-- Query thread log. Has information about all threads participating in query execution.
         Used only for queries with setting log_query_threads = 1. -->
    <query_thread_log>
        <database>system</database>
        <table>query_thread_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </query_thread_log>

    <!-- Query views log. Has information about all dependent views associated with a query.
         Used only for queries with setting log_query_views = 1. -->
    <query_views_log>
        <database>system</database>
        <table>query_views_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </query_views_log>

    <remote_servers>
        <hdx_cluster>
            <shard>
                <replica>
                    <host>ch-server</host>
                    <port>9000</port>
                </replica>
            </shard>
        </hdx_cluster>
    </remote_servers>

    <distributed_ddl>
        <path>/clickhouse/task_queue/ddl</path>
    </distributed_ddl>

    <format_schema_path>/var/lib/clickhouse/format_schemas/</format_schema_path>
</clickhouse>
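The <prometheus> block above makes ClickHouse itself export metrics on port 9363. A scrape job for it could be added to the prometheus receiver that appears later in this commit (a sketch, not part of the commit; it assumes a network path from the scraper to ch-server:9363, e.g. a scraper attached to the stack's internal overlay network):

        - job_name: 'clickhouse'
          scrape_interval: 30s
          static_configs:
            - targets: ['ch-server:9363']   # assumes the scraper shares a network with ch-server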
@@ -0,0 +1,51 @@
<?xml version="1.0"?>
<clickhouse>
    <profiles>
        <default>
            <max_memory_usage>10000000000</max_memory_usage>
            <use_uncompressed_cache>0</use_uncompressed_cache>
            <load_balancing>in_order</load_balancing>
            <log_queries>1</log_queries>
        </default>
    </profiles>

    <users>
        <default>
            <password></password>
            <profile>default</profile>
            <networks>
                <ip>::/0</ip>
            </networks>
            <quota>default</quota>
        </default>
        <api>
            <password>api</password>
            <profile>default</profile>
            <networks>
                <ip>::/0</ip>
            </networks>
            <quota>default</quota>
        </api>
        <worker>
            <password>worker</password>
            <profile>default</profile>
            <networks>
                <ip>::/0</ip>
            </networks>
            <quota>default</quota>
        </worker>
    </users>

    <quotas>
        <default>
            <interval>
                <duration>3600</duration>
                <queries>0</queries>
                <errors>0</errors>
                <result_rows>0</result_rows>
                <read_rows>0</read_rows>
                <execution_time>0</execution_time>
            </interval>
        </default>
    </quotas>
</clickhouse>
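A quick way to confirm these users render correctly (a sketch, not part of the commit; assumes port 8123 is reachable from where the check runs — the ::/0 networks entries allow connections from anywhere):

- name: CLICKHOUSE | Verify the 'api' user can query
  ansible.builtin.uri:
    url: "http://ch-server:8123/?query=SELECT%201"
    url_username: api
    url_password: api
    return_content: true
  changed_when: false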
@@ -0,0 +1,66 @@
---
- name: HYPERDX | Create directories
  ansible.builtin.file:
    path: "{{ data_dir }}/{{ item.path }}"
    state: directory
    owner: "{{ item.uid }}"
    group: "{{ item.gid }}"
    mode: '0755'
    recurse: no
  loop:
    - { path: 'mongo', uid: 999, gid: 999 }            # MongoDB default
    - { path: 'clickhouse/data', uid: 101, gid: 101 }  # ClickHouse default
    - { path: 'clickhouse/logs', uid: 101, gid: 101 }
    - { path: 'clickhouse/config', uid: 101, gid: 101 }
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"

- name: HYPERDX | Copy ClickHouse configuration
  ansible.builtin.copy:
    src: "{{ item }}"
    dest: "/mnt/cephfs/hyperdx/clickhouse/config/"
    owner: 101
    group: 101
    mode: '0644'
  loop:
    - files/config.xml   # local to your Ansible repo
    - files/users.xml
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"

- name: HYPERDX | Create shared-observability network
  community.docker.docker_network:
    name: shared-observability
    driver: overlay
    state: present
    attachable: yes
    ipam_config:
      - subnet: '172.16.116.0/24'
        gateway: '172.16.116.1'
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"

- name: HYPERDX | Generate OTel Collector config
  ansible.builtin.template:
    src: otel-collector-config.yaml.j2
    dest: "{{ data_dir }}/data/otel-collector-config.yaml"
    mode: '0644'
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"

- name: HYPERDX | Generate Compose file
  ansible.builtin.template:
    src: docker-compose.yml.j2
    dest: "{{ data_dir }}/hyperdx.yml"
    mode: '0644'
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"

- name: HYPERDX | Deploy stack
  community.docker.docker_stack:
    state: present
    name: hyperdx
    compose:
      - "{{ data_dir }}/hyperdx.yml"
  delegate_to: "{{ groups['managers'][0] }}"
  run_once: true
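The role leans on a few variables it does not define itself. A minimal group_vars sketch (all values hypothetical, chosen only to match how they are used in this commit) might be:

ceph_volume: "/mnt/cephfs"                              # data_dir root, see defaults below
main_domain: "example.org"                              # placeholder domain
hyperdx_api_key: "0123456789abcdef0123456789abcdef"     # placeholder; generate via `openssl rand -hex 16`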
@@ -0,0 +1,99 @@
version: '3.9'

services:
  db:
    image: mongo:5.0.14-focal
    volumes:
      - "{{ data_dir }}/mongo:/data/db"
    networks:
      - internal
    deploy:
      mode: replicated
      replicas: 1
      # placement:
      #   constraints: [node.role == worker]  # prefer keeping DBs on workers when possible

  otel-collector:
    image: "clickhouse/clickstack-otel-collector:2"
    environment:
      CLICKHOUSE_ENDPOINT: 'tcp://ch-server:9000?dial_timeout=10s'
      HYPERDX_OTEL_EXPORTER_CLICKHOUSE_DATABASE: "default"
      HYPERDX_LOG_LEVEL: "{{ hyperdx_log_level | default('info') }}"
      OPAMP_SERVER_URL: 'http://app:{{ hyperdx_opamp_port | default(4320) }}'
    ports:
      - "4317:4317"  # OTLP gRPC
      - "4318:4318"  # OTLP HTTP
      # - "8888:8888"  # metrics (optional)
    networks:
      - internal
      - shared-observability
      - traefik_public
    deploy:
      mode: replicated
      replicas: 3
      labels:
        - "traefik.enable=true"
        - "traefik.docker.network=traefik_public"
        - "traefik.http.routers.otel-collector.rule=Host(`{{ otlp_domain }}`)"
        - "traefik.http.routers.otel-collector.entrypoints=https"
        - "traefik.http.routers.otel-collector.tls.certresolver=main"
        - "traefik.http.services.otel-collector.loadbalancer.server.port=4318"

  app:
    image: "hyperdx/hyperdx:2"
    environment:
      # adjust URLs so the app is reachable through Traefik
      FRONTEND_URL: "https://{{ hdx_domain }}"
      HYPERDX_APP_URL: "https://{{ hdx_domain }}"

      HYPERDX_API_KEY: "{{ hyperdx_api_key }}"
      HYPERDX_API_PORT: "{{ hyperdx_api_port | default(8000) }}"
      HYPERDX_APP_PORT: "{{ hyperdx_app_port | default(8080) }}"
      HYPERDX_LOG_LEVEL: "{{ hyperdx_log_level | default('info') }}"
      MINER_API_URL: 'http://miner:5123'  # in case the miner is needed (not defined in the original compose?)
      MONGO_URI: 'mongodb://db:27017/hyperdx'
      SERVER_URL: "http://127.0.0.1:{{ hyperdx_api_port | default(8000) }}"
      OPAMP_PORT: "{{ hyperdx_opamp_port | default(4320) }}"
      OTEL_EXPORTER_OTLP_ENDPOINT: 'http://otel-collector:4318'
      OTEL_SERVICE_NAME: 'hdx-oss-app'
      USAGE_STATS_ENABLED: "{{ usage_stats_enabled | default('false') }}"
      # ClickHouse connection string (default user/pass from the ClickHouse image)
      DEFAULT_CONNECTIONS: >-
        [{"name":"Local ClickHouse","host":"http://ch-server:8123","username":"default","password":""}]
      DEFAULT_SOURCES: '{{ hyperdx_default_sources | to_json }}'
    networks:
      - internal
      - traefik_public
    deploy:
      labels:
        - "traefik.enable=true"
        - "traefik.docker.network=traefik_public"
        - "traefik.http.routers.hyperdx.rule=Host(`{{ subdomain }}.{{ main_domain }}`)"
        - "traefik.http.routers.hyperdx.entrypoints=https"
        - "traefik.http.routers.hyperdx.tls.certresolver=main"
        - "traefik.http.services.hyperdx.loadbalancer.server.port={{ hyperdx_app_port | default(8080) }}"

  ch-server:
    image: clickhouse/clickhouse-server:25.6-alpine
    environment:
      CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1
    volumes:
      - "{{ data_dir }}/clickhouse/config/config.xml:/etc/clickhouse-server/config.xml"
      - "{{ data_dir }}/clickhouse/config/users.xml:/etc/clickhouse-server/users.xml"
      - "{{ data_dir }}/clickhouse/data:/var/lib/clickhouse"
      - "{{ data_dir }}/clickhouse/logs:/var/log/clickhouse-server"
    deploy:
      mode: replicated
      replicas: 1
      # placement:
      #   constraints: [node.role == worker]
    networks:
      - internal

networks:
  internal:
    driver: overlay
  traefik_public:
    external: true
  shared-observability:
    external: true
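To smoke-test the public OTLP route, an empty-but-valid OTLP/HTTP payload can be POSTed through Traefik (a sketch, not part of the commit; assumes DNS for otlp_domain resolves and that the collector accepts the API key in the authorization header, the same convention the agent config below uses):

- name: HYPERDX | Smoke-test OTLP HTTP ingest
  ansible.builtin.uri:
    url: "https://{{ otlp_domain }}/v1/logs"   # standard OTLP/HTTP logs path
    method: POST
    headers:
      authorization: "{{ hyperdx_api_key }}"
    body_format: json
    body: { resourceLogs: [] }                 # valid OTLP payload carrying no records
    status_code: 200
  run_once: true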
@@ -0,0 +1,103 @@
data_dir: "{{ ceph_volume }}/hyperdx"
subdomain: "hdx"

hdx_domain: "{{ subdomain }}.{{ main_domain }}"
otlp_domain: "otlp.{{ main_domain }}"

# Generate a secure key: `openssl rand -hex 16`
hyperdx_api_key: ""
hyperdx_api_port: 8000
hyperdx_app_port: 8080
hyperdx_log_level: "info"
hyperdx_opamp_port: 4320

usage_stats_enabled: "false"

# Definition of the data sources for the frontend
hyperdx_default_sources:
  - name: "Logs"
    kind: "log"
    from:
      databaseName: "default"
      tableName: "otel_logs"
    timestampValueExpression: "TimestampTime"
    displayedTimestampValueExpression: "Timestamp"
    implicitColumnExpression: "Body"
    serviceNameExpression: "ServiceName"
    bodyExpression: "Body"
    eventAttributesExpression: "LogAttributes"
    resourceAttributesExpression: "ResourceAttributes"
    defaultTableSelectExpression: "Timestamp,ServiceName,SeverityText,Body"
    severityTextExpression: "SeverityText"
    traceIdExpression: "TraceId"
    spanIdExpression: "SpanId"
    connection: "Local ClickHouse"
    traceSourceId: "Traces"
    sessionSourceId: "Sessions"
    metricSourceId: "Metrics"

  - name: "Traces"
    kind: "trace"
    from:
      databaseName: "default"
      tableName: "otel_traces"
    timestampValueExpression: "Timestamp"
    displayedTimestampValueExpression: "Timestamp"
    implicitColumnExpression: "SpanName"
    serviceNameExpression: "ServiceName"
    bodyExpression: "SpanName"
    eventAttributesExpression: "SpanAttributes"
    resourceAttributesExpression: "ResourceAttributes"
    defaultTableSelectExpression: "Timestamp,ServiceName,StatusCode,round(Duration/1e6),SpanName"
    traceIdExpression: "TraceId"
    spanIdExpression: "SpanId"
    durationExpression: "Duration"
    durationPrecision: 9
    parentSpanIdExpression: "ParentSpanId"
    spanNameExpression: "SpanName"
    spanKindExpression: "SpanKind"
    statusCodeExpression: "StatusCode"
    statusMessageExpression: "StatusMessage"
    connection: "Local ClickHouse"
    logSourceId: "Logs"
    sessionSourceId: "Sessions"
    metricSourceId: "Metrics"

  - name: "Metrics"
    kind: "metric"
    from:
      databaseName: "default"
      tableName: ""
    timestampValueExpression: "TimeUnix"
    resourceAttributesExpression: "ResourceAttributes"
    metricTables:
      gauge: "otel_metrics_gauge"
      histogram: "otel_metrics_histogram"
      sum: "otel_metrics_sum"
      _id: "682586a8b1f81924e628e808"
      id: "682586a8b1f81924e628e808"
    connection: "Local ClickHouse"
    logSourceId: "Logs"
    traceSourceId: "Traces"
    sessionSourceId: "Sessions"

  - name: "Sessions"
    kind: "session"
    from:
      databaseName: "default"
      tableName: "hyperdx_sessions"
    timestampValueExpression: "TimestampTime"
    displayedTimestampValueExpression: "Timestamp"
    implicitColumnExpression: "Body"
    serviceNameExpression: "ServiceName"
    bodyExpression: "Body"
    eventAttributesExpression: "LogAttributes"
    resourceAttributesExpression: "ResourceAttributes"
    defaultTableSelectExpression: "Timestamp,ServiceName,SeverityText,Body"
    severityTextExpression: "SeverityText"
    traceIdExpression: "TraceId"
    spanIdExpression: "SpanId"
    connection: "Local ClickHouse"
    logSourceId: "Logs"
    traceSourceId: "Traces"
    metricSourceId: "Metrics"
@@ -0,0 +1,32 @@
- name: MONITORING | Ensure data directories
  ansible.builtin.file:
    path: "{{ data_dir }}/data"
    state: directory
    mode: '0755'
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"

- name: MONITORING | Generate config
  ansible.builtin.template:
    src: otel-agent-config.yaml.j2
    dest: "{{ data_dir }}/otel-agent-config.yaml"
  delegate_to: "{{ groups['managers'][0] }}"
  run_once: true

- name: MONITORING | Generate Compose file
  ansible.builtin.template:
    src: docker-compose.yml.j2
    dest: "{{ data_dir }}/monitoring.yml"
    mode: '0644'
  run_once: true
  delegate_to: "{{ groups['managers'][0] }}"

- name: MONITORING | Deploy stack
  community.docker.docker_stack:
    state: present
    name: infra-monitoring
    compose:
      - "{{ data_dir }}/monitoring.yml"
  delegate_to: "{{ groups['managers'][0] }}"
  run_once: true
@@ -0,0 +1,38 @@
version: '3.9'

services:
  otel-agent:
    image: otel/opentelemetry-collector-contrib:0.143.0
    user: "0:0"  # root for hardware access
    command: ["--config=/etc/otel-agent-config.yaml"]
    security_opt:
      - apparmor:unconfined
    volumes:
      - "{{ data_dir }}/otel-agent-config.yaml:/etc/otel-agent-config.yaml"
      - /:/hostfs:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /sys:/hostfs/sys:ro
      - /proc:/hostfs/proc:ro
    environment:
      - GOMEMLIMIT=180MiB
      - HOST_PROC=/hostfs/proc
      - HOST_SYS=/hostfs/sys
      - HOST_ETC=/hostfs/etc
      - HOST_VAR=/hostfs/var
      - HOST_RUN=/hostfs/run
      - HOST_DEV=/hostfs/dev
    deploy:
      mode: global
      update_config:
        parallelism: 1
        delay: 10s
      resources:
        limits:
          memory: 200M
    networks:
      - host

networks:
  host:
    name: host
    external: true
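Since the agent runs globally on the host network, a follow-up check that it came up on every node could be appended to the monitoring tasks (a sketch, not part of the commit; `infra-monitoring_otel-agent` is the service name Swarm derives from the stack name and service key above):

- name: MONITORING | Check agent service status on all nodes
  ansible.builtin.command: "docker service ps infra-monitoring_otel-agent"
  changed_when: false
  delegate_to: "{{ groups['managers'][0] }}"
  run_once: true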
@@ -0,0 +1,117 @@
extensions:
  # watches Docker containers
  docker_observer:
    endpoint: "unix:///var/run/docker.sock"
    cache_sync_interval: 30s

receivers:
  hostmetrics:
    root_path: /hostfs
    collection_interval: 15s
    scrapers:
      cpu:
        metrics:
          system.cpu.time:
            enabled: true
          system.cpu.utilization:
            enabled: true
      memory:
        metrics:
          system.memory.usage:
            enabled: true
          system.memory.utilization:
            enabled: true
      filesystem:
        metrics:
          system.filesystem.usage:
            enabled: true
          system.filesystem.utilization:
            enabled: true
      paging:
        metrics:
          system.paging.usage:
            enabled: true
          system.paging.utilization:
            enabled: true
          system.paging.faults:
            enabled: true
      load:
      disk:
      network:

  docker_stats:
    endpoint: unix:///var/run/docker.sock
    collection_interval: 30s
    timeout: 20s

  # receiver_creator:
  #   watch_observers: [docker_observer]
  #   receivers:
  #     filelog:
  #       rule: type == "container"  # containers only
  #       config:
  #         include:
  #           - /hostfs/var/lib/docker/containers/*/*.log
  #         operators:
  #           - type: container
  #             format: docker
  #             add_metadata_from_filepath: true
  #           - type: json_parser
  #             timestamp:
  #               parse_from: time
  #               layout: '%Y-%m-%dT%H:%M:%S.%LZ'
  #             severity:
  #               parse_from: stream
  #               mapping:
  #                 info: stdout
  #                 error: stderr

  # Ceph scraping (only works on nodes where the Ceph Mgr is running)
  prometheus:
    config:
      scrape_configs:
        - job_name: 'ceph-local'
          scrape_interval: 30s
          scrape_timeout: 10s
          static_configs:
            - targets: ['127.0.0.1:9283']
          metric_relabel_configs:
            - source_labels: [__name__]
              regex: 'ceph_cluster_total_.*|ceph_health_status|ceph_osd_.*|ceph_pool_.*'
              action: keep

processors:
  batch:
    timeout: 5s

  resourcedetection:
    detectors: [env, system]

  resourcedetection/docker:
    detectors: [env, docker]
    timeout: 2s
    override: false

exporters:
  debug:
    verbosity: detailed
  otlp:
    endpoint: "127.0.0.1:4317"
    headers:
      authorization: "{{ hyperdx_api_ingestion_key }}"
    compression: gzip
    tls:
      insecure: true

service:
  extensions: [docker_observer]
  pipelines:
    metrics:
      receivers: [hostmetrics, docker_stats, prometheus]
      # receivers: [hostmetrics]
      processors: [resourcedetection, batch]
      exporters: [otlp]
    # logs:
    #   receivers: [receiver_creator]
    #   processors: [resourcedetection/docker, batch]
    #   exporters: [otlp, debug]
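Since the compose file caps the agent at 200M (with GOMEMLIMIT=180MiB), a memory_limiter in front of batch would be the usual safeguard against OOM kills (a sketch using the contrib collector's standard processor, not part of this commit; the limits mirror the container settings):

processors:
  memory_limiter:
    check_interval: 1s     # how often to measure memory
    limit_mib: 180         # hard limit, matching GOMEMLIMIT
    spike_limit_mib: 40    # headroom for short bursts

and then list it first in the pipeline: processors: [memory_limiter, resourcedetection, batch].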
@@ -0,0 +1 @@
data_dir: "{{ ceph_volume }}/infra-monitoring"
@@ -1,13 +0,0 @@
---
- name: Copy Stack Files
  copy:
    directory_mode: true
    src: /Users/d3r0/dev/repositories/active/gc/iac/ansible/resources/monitoring
    dest: /srv
- block:
    - name: Deploy Monitoring stack
      community.docker.docker_stack:
        state: present
        name: monitoring
        compose:
          - /srv/monitoring/observability.yml
@@ -59,6 +59,14 @@
    interface: "{{ private_interface }}"
    direction: in

- name: FIREWALL | Allow Ceph Prometheus exporter on the private interface
  community.general.ufw:
    rule: allow
    port: "9283"
    proto: tcp
    interface: "{{ private_interface }}"
    direction: in

- name: FIREWALL | Allow Docker Swarm management ports on the private interface
  community.general.ufw:
    rule: allow