diff --git a/iac/ansible/roles/ceph_setup/tasks/main.yml b/iac/ansible/roles/ceph_setup/tasks/main.yml
index 91f7252..c541a4e 100644
--- a/iac/ansible/roles/ceph_setup/tasks/main.yml
+++ b/iac/ansible/roles/ceph_setup/tasks/main.yml
@@ -90,4 +90,19 @@
src: "{{ hostvars[groups['managers'][0]]['ceph_bootstrap_ip'] }}:/"
fstype: ceph
opts: "name=admin,secret={{ ceph_admin_key.stdout }}"
- state: mounted
\ No newline at end of file
+ state: mounted
+
+# Enable metrics
+- name: CEPH | Check whether the Prometheus module is already enabled
+ ansible.builtin.command: "ceph mgr module ls --format json"
+ register: ceph_modules_status
+ changed_when: false
+ delegate_to: "{{ groups['managers'][0] }}"
+ run_once: true
+
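+# Abridged shape of `ceph mgr module ls --format json` that the condition below
+# relies on (key names as in recent Ceph releases):
+#   {"always_on_modules": [...], "enabled_modules": ["iostat", ...], "disabled_modules": [...]}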
+- name: CEPH | Enable the Prometheus module
+  ansible.builtin.command: "ceph mgr module enable prometheus"
+  # The JSON output is checked for 'prometheus' missing from the 'enabled_modules' list
+  when: "'prometheus' not in (ceph_modules_status.stdout | from_json).enabled_modules"
+ delegate_to: "{{ groups['managers'][0] }}"
+ run_once: true
\ No newline at end of file
diff --git a/iac/ansible/roles/hyperdx/files/config.xml b/iac/ansible/roles/hyperdx/files/config.xml
new file mode 100644
index 0000000..84eca4b
--- /dev/null
+++ b/iac/ansible/roles/hyperdx/files/config.xml
@@ -0,0 +1,175 @@
+<clickhouse>
+
+    <logger>
+        <level>debug</level>
+        <console>true</console>
+    </logger>
+
+    <listen_host>0.0.0.0</listen_host>
+    <http_port>8123</http_port>
+    <tcp_port>9000</tcp_port>
+    <interserver_http_host>ch-server</interserver_http_host>
+    <interserver_http_port>9009</interserver_http_port>
+
+    <max_connections>4096</max_connections>
+    <keep_alive_timeout>64</keep_alive_timeout>
+    <max_concurrent_queries>100</max_concurrent_queries>
+    <uncompressed_cache_size>8589934592</uncompressed_cache_size>
+    <mark_cache_size>5368709120</mark_cache_size>
+
+    <path>/var/lib/clickhouse/</path>
+    <tmp_path>/var/lib/clickhouse/tmp/</tmp_path>
+    <user_files_path>/var/lib/clickhouse/user_files/</user_files_path>
+
+    <users_config>users.xml</users_config>
+
+    <default_profile>default</default_profile>
+    <default_database>default</default_database>
+    <timezone>UTC</timezone>
+    <mlock_executable>false</mlock_executable>
+
+    <prometheus>
+        <endpoint>/metrics</endpoint>
+        <port>9363</port>
+        <metrics>true</metrics>
+        <events>true</events>
+        <asynchronous_metrics>true</asynchronous_metrics>
+        <status_info>true</status_info>
+    </prometheus>
+
+    <query_log>
+        <database>system</database>
+        <table>query_log</table>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+    </query_log>
+
+    <metric_log>
+        <database>system</database>
+        <table>metric_log</table>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+        <collect_interval_milliseconds>1000</collect_interval_milliseconds>
+    </metric_log>
+
+    <asynchronous_metric_log>
+        <database>system</database>
+        <table>asynchronous_metric_log</table>
+        <flush_interval_milliseconds>7000</flush_interval_milliseconds>
+    </asynchronous_metric_log>
+
+    <opentelemetry_span_log>
+        <engine>
+            engine MergeTree
+            partition by toYYYYMM(finish_date)
+            order by (finish_date, finish_time_us, trace_id)
+        </engine>
+        <database>system</database>
+        <table>opentelemetry_span_log</table>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+    </opentelemetry_span_log>
+
+    <crash_log>
+        <database>system</database>
+        <table>crash_log</table>
+        <partition_by/>
+        <flush_interval_milliseconds>1000</flush_interval_milliseconds>
+    </crash_log>
+
+    <trace_log>
+        <database>system</database>
+        <table>trace_log</table>
+        <partition_by>toYYYYMM(event_date)</partition_by>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+    </trace_log>
+
+    <query_thread_log>
+        <database>system</database>
+        <table>query_thread_log</table>
+        <partition_by>toYYYYMM(event_date)</partition_by>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+    </query_thread_log>
+
+    <query_views_log>
+        <database>system</database>
+        <table>query_views_log</table>
+        <partition_by>toYYYYMM(event_date)</partition_by>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+    </query_views_log>
+
+    <part_log>
+        <database>system</database>
+        <table>part_log</table>
+        <partition_by>toYYYYMM(event_date)</partition_by>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+    </part_log>
+
+    <session_log>
+        <database>system</database>
+        <table>session_log</table>
+        <partition_by>toYYYYMM(event_date)</partition_by>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+    </session_log>
+
+    <remote_servers>
+        <default>
+            <shard>
+                <replica>
+                    <host>ch-server</host>
+                    <port>9000</port>
+                </replica>
+            </shard>
+        </default>
+    </remote_servers>
+
+    <distributed_ddl>
+        <path>/clickhouse/task_queue/ddl</path>
+    </distributed_ddl>
+
+    <format_schema_path>/var/lib/clickhouse/format_schemas/</format_schema_path>
+</clickhouse>
diff --git a/iac/ansible/roles/hyperdx/files/users.xml b/iac/ansible/roles/hyperdx/files/users.xml
new file mode 100644
index 0000000..80c8d4f
--- /dev/null
+++ b/iac/ansible/roles/hyperdx/files/users.xml
@@ -0,0 +1,51 @@
+<clickhouse>
+
+    <profiles>
+        <default>
+            <max_memory_usage>10000000000</max_memory_usage>
+            <use_uncompressed_cache>0</use_uncompressed_cache>
+            <load_balancing>in_order</load_balancing>
+            <log_queries>1</log_queries>
+        </default>
+    </profiles>
+
+    <users>
+        <default>
+            <password></password>
+            <profile>default</profile>
+            <networks>
+                <ip>::/0</ip>
+            </networks>
+            <quota>default</quota>
+        </default>
+
+        <api>
+            <password>api</password>
+            <profile>default</profile>
+            <networks>
+                <ip>::/0</ip>
+            </networks>
+            <quota>default</quota>
+        </api>
+
+        <worker>
+            <password>worker</password>
+            <profile>default</profile>
+            <networks>
+                <ip>::/0</ip>
+            </networks>
+            <quota>default</quota>
+        </worker>
+    </users>
+
+    <quotas>
+        <default>
+            <interval>
+                <duration>3600</duration>
+                <queries>0</queries>
+                <errors>0</errors>
+                <result_rows>0</result_rows>
+                <read_rows>0</read_rows>
+                <execution_time>0</execution_time>
+            </interval>
+        </default>
+    </quotas>
+</clickhouse>
diff --git a/iac/ansible/roles/hyperdx/tasks/main.yml b/iac/ansible/roles/hyperdx/tasks/main.yml
new file mode 100644
index 0000000..6bd89d5
--- /dev/null
+++ b/iac/ansible/roles/hyperdx/tasks/main.yml
@@ -0,0 +1,66 @@
+---
+- name: HYPERDX | Create directories
+ ansible.builtin.file:
+ path: "{{ data_dir }}/{{ item.path }}"
+ state: directory
+ owner: "{{ item.uid }}"
+ group: "{{ item.gid }}"
+ mode: '0755'
+ recurse: no
+ loop:
+    - { path: 'mongo', uid: 999, gid: 999 }            # MongoDB default UID/GID
+    - { path: 'clickhouse/data', uid: 101, gid: 101 }  # ClickHouse default UID/GID
+ - { path: 'clickhouse/logs', uid: 101, gid: 101 }
+ - { path: 'clickhouse/config', uid: 101, gid: 101 }
+ run_once: true
+ delegate_to: "{{ groups['managers'][0] }}"
+
+- name: HYPERDX | Copy ClickHouse configuration
+ ansible.builtin.copy:
+ src: "{{ item }}"
+    dest: "{{ data_dir }}/clickhouse/config/"
+ owner: 101
+ group: 101
+ mode: '0644'
+ loop:
+    - files/config.xml  # local to this Ansible repo
+ - files/users.xml
+ run_once: true
+ delegate_to: "{{ groups['managers'][0] }}"
+
+- name: HYPERDX | Create the shared-observability network
+ community.docker.docker_network:
+ name: shared-observability
+ driver: overlay
+ state: present
+ attachable: yes
+ ipam_config:
+ - subnet: '172.16.116.0/24'
+ gateway: '172.16.116.1'
+ run_once: true
+ delegate_to: "{{ groups['managers'][0] }}"
+
+- name: HYPERDX | Generate OTel Collector config
+ ansible.builtin.template:
+ src: otel-collector-config.yaml.j2
+ dest: "{{ data_dir }}/data/otel-collector-config.yaml"
+ mode: '0644'
+ run_once: true
+ delegate_to: "{{ groups['managers'][0] }}"
+
+- name: HYPERDX | Generate Compose file
+ ansible.builtin.template:
+ src: docker-compose.yml.j2
+ dest: '{{ data_dir }}/hyperdx.yml'
+    mode: '0644'
+ run_once: true
+ delegate_to: "{{ groups['managers'][0] }}"
+
+- name: HYPERDX | Deploy stack
+ community.docker.docker_stack:
+ state: present
+ name: hyperdx
+ compose:
+ - '{{ data_dir }}/hyperdx.yml'
+ delegate_to: "{{ groups['managers'][0] }}"
+ run_once: true
diff --git a/iac/ansible/roles/hyperdx/templates/docker-compose.yml.j2 b/iac/ansible/roles/hyperdx/templates/docker-compose.yml.j2
new file mode 100644
index 0000000..bbfb229
--- /dev/null
+++ b/iac/ansible/roles/hyperdx/templates/docker-compose.yml.j2
@@ -0,0 +1,99 @@
+version: '3.9'
+
+services:
+ db:
+ image: mongo:5.0.14-focal
+ volumes:
+ - "{{ data_dir }}/mongo:/data/db"
+ networks:
+ - internal
+ deploy:
+ mode: replicated
+ replicas: 1
+ # placement:
+    #   constraints: [node.role == worker]  # prefer keeping DBs on worker nodes where possible
+
+ otel-collector:
+ image: "clickhouse/clickstack-otel-collector:2"
+ environment:
+ CLICKHOUSE_ENDPOINT: 'tcp://ch-server:9000?dial_timeout=10s'
+ HYPERDX_OTEL_EXPORTER_CLICKHOUSE_DATABASE: "default"
+ HYPERDX_LOG_LEVEL: "{{ hyperdx_log_level | default('info') }}"
+ OPAMP_SERVER_URL: 'http://app:{{ hyperdx_opamp_port | default(4320) }}'
+ ports:
+ - "4317:4317" # OTLP gRPC
+ - "4318:4318" # OTLP HTTP
+ # - "8888:8888" # Metrics (optional)
+ networks:
+ - internal
+ - shared-observability
+ - traefik_public
+ deploy:
+ mode: replicated
+ replicas: 3
+ labels:
+ - "traefik.enable=true"
+ - "traefik.docker.network=traefik_public"
+ - "traefik.http.routers.otel-collector.rule=Host(`{{ otlp_domain }}`)"
+ - "traefik.http.routers.otel-collector.entrypoints=https"
+ - "traefik.http.routers.otel-collector.tls.certresolver=main"
+ - "traefik.http.services.otel-collector.loadbalancer.server.port=4318"
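+      # Example (assumes DNS for {{ otlp_domain }} points at Traefik): OTLP/HTTP
+      # traffic can then be sent through the router above, e.g.
+      #   curl -X POST https://{{ otlp_domain }}/v1/traces -H 'content-type: application/json' -d @spans.json
+      # (spans.json is a hypothetical payload; /v1/traces, /v1/logs and /v1/metrics
+      # are the standard OTLP/HTTP paths). OTLP gRPC on 4317 stays reachable only
+      # via the published swarm port.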
+
+ app:
+ image: "hyperdx/hyperdx:2"
+ environment:
+      # URLs must match the externally reachable Traefik hostnames
+ FRONTEND_URL: "https://{{ hdx_domain }}"
+ HYPERDX_APP_URL: "https://{{ hdx_domain }}"
+
+ HYPERDX_API_KEY: "{{ hyperdx_api_key }}"
+ HYPERDX_API_PORT: "{{ hyperdx_api_port | default(8000) }}"
+ HYPERDX_APP_PORT: "{{ hyperdx_app_port | default(8080) }}"
+ HYPERDX_LOG_LEVEL: "{{ hyperdx_log_level | default('info') }}"
+      MINER_API_URL: 'http://miner:5123'  # in case a miner service is needed (not defined in the original compose?)
+ MONGO_URI: 'mongodb://db:27017/hyperdx'
+ SERVER_URL: "http://127.0.0.1:{{ hyperdx_api_port | default(8000) }}"
+ OPAMP_PORT: "{{ hyperdx_opamp_port | default(4320) }}"
+ OTEL_EXPORTER_OTLP_ENDPOINT: 'http://otel-collector:4318'
+ OTEL_SERVICE_NAME: 'hdx-oss-app'
+ USAGE_STATS_ENABLED: "{{ usage_stats_enabled | default('false') }}"
+      # ClickHouse connection string (default user/password from the ClickHouse image)
+ DEFAULT_CONNECTIONS: >-
+ [{"name":"Local ClickHouse","host":"http://ch-server:8123","username":"default","password":""}]
+ DEFAULT_SOURCES: '{{ hyperdx_default_sources | to_json }}'
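+      # DEFAULT_SOURCES is rendered from `hyperdx_default_sources` in this role's
+      # vars; each source's `connection` field must match the connection name
+      # ("Local ClickHouse") declared in DEFAULT_CONNECTIONS above.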
+ networks:
+ - internal
+ - traefik_public
+ deploy:
+ labels:
+ - "traefik.enable=true"
+ - "traefik.docker.network=traefik_public"
+        - "traefik.http.routers.hyperdx.rule=Host(`{{ hdx_domain }}`)"
+ - "traefik.http.routers.hyperdx.entrypoints=https"
+ - "traefik.http.routers.hyperdx.tls.certresolver=main"
+ - "traefik.http.services.hyperdx.loadbalancer.server.port={{ hyperdx_app_port | default(8080) }}"
+
+ ch-server:
+ image: clickhouse/clickhouse-server:25.6-alpine
+ environment:
+ CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1
+ volumes:
+ - "{{ data_dir }}/clickhouse/config/config.xml:/etc/clickhouse-server/config.xml"
+ - "{{ data_dir }}/clickhouse/config/users.xml:/etc/clickhouse-server/users.xml"
+ - "{{ data_dir }}/clickhouse/data:/var/lib/clickhouse"
+ - "{{ data_dir }}/clickhouse/logs:/var/log/clickhouse-server"
+ deploy:
+ mode: replicated
+ replicas: 1
+ # placement:
+ # constraints: [node.role == worker]
+ networks:
+ - internal
+
+networks:
+ internal:
+ driver: overlay
+ traefik_public:
+ external: true
+ shared-observability:
+ external: true
\ No newline at end of file
diff --git a/iac/ansible/roles/hyperdx/vars/main.yml b/iac/ansible/roles/hyperdx/vars/main.yml
new file mode 100644
index 0000000..bee7321
--- /dev/null
+++ b/iac/ansible/roles/hyperdx/vars/main.yml
@@ -0,0 +1,103 @@
+data_dir: "{{ ceph_volume }}/hyperdx"
+subdomain: "hdx"
+
+hdx_domain: "{{ subdomain }}.{{ main_domain }}"
+otlp_domain: "otlp.{{ main_domain }}"
+
+# Generate a secure key, e.g. with: `openssl rand -hex 16`
+hyperdx_api_key: ""
+hyperdx_api_port: 8000
+hyperdx_app_port: 8080
+hyperdx_log_level: "info"
+hyperdx_opamp_port: 4320
+
+usage_stats_enabled: "false"
+
+# Data source definitions for the HyperDX frontend
+hyperdx_default_sources:
+ - name: "Logs"
+ kind: "log"
+ from:
+ databaseName: "default"
+ tableName: "otel_logs"
+ timestampValueExpression: "TimestampTime"
+ displayedTimestampValueExpression: "Timestamp"
+ implicitColumnExpression: "Body"
+ serviceNameExpression: "ServiceName"
+ bodyExpression: "Body"
+ eventAttributesExpression: "LogAttributes"
+ resourceAttributesExpression: "ResourceAttributes"
+ defaultTableSelectExpression: "Timestamp,ServiceName,SeverityText,Body"
+ severityTextExpression: "SeverityText"
+ traceIdExpression: "TraceId"
+ spanIdExpression: "SpanId"
+ connection: "Local ClickHouse"
+ traceSourceId: "Traces"
+ sessionSourceId: "Sessions"
+ metricSourceId: "Metrics"
+
+ - name: "Traces"
+ kind: "trace"
+ from:
+ databaseName: "default"
+ tableName: "otel_traces"
+ timestampValueExpression: "Timestamp"
+ displayedTimestampValueExpression: "Timestamp"
+ implicitColumnExpression: "SpanName"
+ serviceNameExpression: "ServiceName"
+ bodyExpression: "SpanName"
+ eventAttributesExpression: "SpanAttributes"
+ resourceAttributesExpression: "ResourceAttributes"
+ defaultTableSelectExpression: "Timestamp,ServiceName,StatusCode,round(Duration/1e6),SpanName"
+ traceIdExpression: "TraceId"
+ spanIdExpression: "SpanId"
+ durationExpression: "Duration"
+ durationPrecision: 9
+ parentSpanIdExpression: "ParentSpanId"
+ spanNameExpression: "SpanName"
+ spanKindExpression: "SpanKind"
+ statusCodeExpression: "StatusCode"
+ statusMessageExpression: "StatusMessage"
+ connection: "Local ClickHouse"
+ logSourceId: "Logs"
+ sessionSourceId: "Sessions"
+ metricSourceId: "Metrics"
+
+ - name: "Metrics"
+ kind: "metric"
+ from:
+ databaseName: "default"
+ tableName: ""
+ timestampValueExpression: "TimeUnix"
+ resourceAttributesExpression: "ResourceAttributes"
+ metricTables:
+ gauge: "otel_metrics_gauge"
+ histogram: "otel_metrics_histogram"
+ sum: "otel_metrics_sum"
+ _id: "682586a8b1f81924e628e808"
+ id: "682586a8b1f81924e628e808"
+ connection: "Local ClickHouse"
+ logSourceId: "Logs"
+ traceSourceId: "Traces"
+ sessionSourceId: "Sessions"
+
+ - name: "Sessions"
+ kind: "session"
+ from:
+ databaseName: "default"
+ tableName: "hyperdx_sessions"
+ timestampValueExpression: "TimestampTime"
+ displayedTimestampValueExpression: "Timestamp"
+ implicitColumnExpression: "Body"
+ serviceNameExpression: "ServiceName"
+ bodyExpression: "Body"
+ eventAttributesExpression: "LogAttributes"
+ resourceAttributesExpression: "ResourceAttributes"
+ defaultTableSelectExpression: "Timestamp,ServiceName,SeverityText,Body"
+ severityTextExpression: "SeverityText"
+ traceIdExpression: "TraceId"
+ spanIdExpression: "SpanId"
+ connection: "Local ClickHouse"
+ logSourceId: "Logs"
+ traceSourceId: "Traces"
+ metricSourceId: "Metrics"
\ No newline at end of file
diff --git a/iac/ansible/roles/infra-monitoring/defaults/main.yml b/iac/ansible/roles/infra-monitoring/defaults/main.yml
new file mode 100644
index 0000000..e69de29
diff --git a/iac/ansible/roles/infra-monitoring/tasks/main.yml b/iac/ansible/roles/infra-monitoring/tasks/main.yml
new file mode 100644
index 0000000..17fe4d9
--- /dev/null
+++ b/iac/ansible/roles/infra-monitoring/tasks/main.yml
@@ -0,0 +1,32 @@
+---
+- name: MONITORING | Ensure data directories
+  ansible.builtin.file:
+ path: "{{ data_dir }}/data"
+ state: directory
+ mode: '0755'
+ run_once: true
+ delegate_to: "{{ groups['managers'][0] }}"
+
+- name: MONITORING | Generate agent config
+  ansible.builtin.template:
+ src: otel-agent-config.yaml.j2
+ dest: "{{ data_dir }}/otel-agent-config.yaml"
+ delegate_to: "{{ groups['managers'][0] }}"
+ run_once: true
+
+- name: MONITORING | Generate Compose file
+  ansible.builtin.template:
+ src: docker-compose.yml.j2
+ dest: "{{ data_dir }}/monitoring.yml"
+    mode: '0644'
+ run_once: true
+ delegate_to: "{{ groups['managers'][0] }}"
+
+- name: MONITORING | Deploy stack
+ community.docker.docker_stack:
+ state: present
+ name: infra-monitoring
+ compose:
+ - "{{ data_dir }}/monitoring.yml"
+ delegate_to: "{{ groups['managers'][0] }}"
+ run_once: true
\ No newline at end of file
diff --git a/iac/ansible/roles/infra-monitoring/templates/docker-compose.yml.j2 b/iac/ansible/roles/infra-monitoring/templates/docker-compose.yml.j2
new file mode 100644
index 0000000..3600619
--- /dev/null
+++ b/iac/ansible/roles/infra-monitoring/templates/docker-compose.yml.j2
@@ -0,0 +1,38 @@
+version: '3.9'
+
+services:
+ otel-agent:
+ image: otel/opentelemetry-collector-contrib:0.143.0
+    user: "0:0"  # root, needed for host/hardware access
+ command: ["--config=/etc/otel-agent-config.yaml"]
+ security_opt:
+ - apparmor:unconfined
+ volumes:
+ - {{ data_dir }}/otel-agent-config.yaml:/etc/otel-agent-config.yaml
+ - /:/hostfs:ro
+ - /var/run/docker.sock:/var/run/docker.sock:ro
+ - /sys:/hostfs/sys:ro
+ - /proc:/hostfs/proc:ro
+ environment:
+ - GOMEMLIMIT=180MiB
+ - HOST_PROC=/hostfs/proc
+ - HOST_SYS=/hostfs/sys
+ - HOST_ETC=/hostfs/etc
+ - HOST_VAR=/hostfs/var
+ - HOST_RUN=/hostfs/run
+ - HOST_DEV=/hostfs/dev
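+      # The HOST_* variables point the collector's gopsutil-based scrapers at the
+      # /hostfs mounts above (matching `root_path: /hostfs` in the agent config);
+      # GOMEMLIMIT sits ~10% below the 200M container limit so the Go runtime
+      # frees memory before the hard cap is reached.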
+ deploy:
+ mode: global
+ update_config:
+ parallelism: 1
+ delay: 10s
+ resources:
+ limits:
+ memory: 200M
+ networks:
+ - host
+
+networks:
+ host:
+ name: host
+ external: true
\ No newline at end of file
diff --git a/iac/ansible/roles/infra-monitoring/templates/otel-agent-config.yaml.j2 b/iac/ansible/roles/infra-monitoring/templates/otel-agent-config.yaml.j2
new file mode 100644
index 0000000..23a4fdb
--- /dev/null
+++ b/iac/ansible/roles/infra-monitoring/templates/otel-agent-config.yaml.j2
@@ -0,0 +1,117 @@
+extensions:
+  # Observes Docker containers via the Docker socket
+ docker_observer:
+ endpoint: "unix:///var/run/docker.sock"
+ cache_sync_interval: 30s
+
+receivers:
+ hostmetrics:
+ root_path: /hostfs
+ collection_interval: 15s
+ scrapers:
+ cpu:
+ metrics:
+ system.cpu.time:
+ enabled: true
+ system.cpu.utilization:
+ enabled: true
+ memory:
+ metrics:
+ system.memory.usage:
+ enabled: true
+ system.memory.utilization:
+ enabled: true
+ filesystem:
+ metrics:
+ system.filesystem.usage:
+ enabled: true
+ system.filesystem.utilization:
+ enabled: true
+ paging:
+ metrics:
+ system.paging.usage:
+ enabled: true
+ system.paging.utilization:
+ enabled: true
+ system.paging.faults:
+ enabled: true
+ load:
+ disk:
+ network:
+
+ docker_stats:
+ endpoint: unix:///var/run/docker.sock
+ collection_interval: 30s
+ timeout: 20s
+
+ # receiver_creator:
+ # watch_observers: [docker_observer]
+ # receivers:
+ # filelog:
+  #        rule: type == "container"  # containers only
+ # config:
+ # include:
+ # - /hostfs/var/lib/docker/containers/*/*.log
+ # operators:
+ # - type: container
+ # format: docker
+ # add_metadata_from_filepath: true
+ # - type: json_parser
+ # timestamp:
+ # parse_from: time
+ # layout: '%Y-%m-%dT%H:%M:%S.%LZ'
+ # severity:
+ # parse_from: stream
+ # mapping:
+ # info: stdout
+ # error: stderr
+
+  # Ceph scraping (only yields data on nodes where a Ceph MGR is running)
+ prometheus:
+ config:
+ scrape_configs:
+ - job_name: 'ceph-local'
+ scrape_interval: 30s
+ scrape_timeout: 10s
+ static_configs:
+ - targets: ['127.0.0.1:9283']
+ metric_relabel_configs:
+ - source_labels: [__name__]
+ regex: 'ceph_cluster_total_.*|ceph_health_status|ceph_osd_.*|ceph_pool_.*'
+ action: keep
+
+processors:
+ batch:
+ timeout: 5s
+
+ resourcedetection:
+ detectors: [env, system]
+
+ resourcedetection/docker:
+ detectors: [env, docker]
+ timeout: 2s
+ override: false
+
+exporters:
+ debug:
+ verbosity: detailed
+ otlp:
+ endpoint: "127.0.0.1:4317"
+ headers:
+ authorization: {{ hyperdx_api_ingestion_key }}
+ compression: gzip
+ tls:
+ insecure: true
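+    # 127.0.0.1:4317 targets the node-local published OTLP gRPC port of the
+    # hyperdx otel-collector; the hop never leaves the host, hence `insecure: true`.
+    # `hyperdx_api_ingestion_key` must hold the HyperDX ingestion API key
+    # (assumed to come from inventory/group vars; it is not defined in this role).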
+
+service:
+ extensions: [docker_observer]
+ pipelines:
+ metrics:
+ receivers: [hostmetrics, docker_stats, prometheus]
+ # receivers: [hostmetrics]
+ processors: [resourcedetection, batch]
+ exporters: [otlp]
+ # logs:
+ # receivers: [receiver_creator]
+ # processors: [resourcedetection/docker, batch]
+ # exporters: [otlp, debug]
\ No newline at end of file
diff --git a/iac/ansible/roles/infra-monitoring/vars/main.yml b/iac/ansible/roles/infra-monitoring/vars/main.yml
new file mode 100644
index 0000000..185c87c
--- /dev/null
+++ b/iac/ansible/roles/infra-monitoring/vars/main.yml
@@ -0,0 +1 @@
+data_dir: "{{ ceph_volume }}/infra-monitoring"
\ No newline at end of file
diff --git a/iac/ansible/roles/monitoring/tasks/main.yml b/iac/ansible/roles/monitoring/tasks/main.yml
deleted file mode 100644
index 45befbc..0000000
--- a/iac/ansible/roles/monitoring/tasks/main.yml
+++ /dev/null
@@ -1,13 +0,0 @@
----
-- name: Copy Stack Files
- copy:
- directory_mode: true
- src: /Users/d3r0/dev/repositories/active/gc/iac/ansible/resources/monitoring
- dest: /srv
-- block:
- - name: Deploy Monitoring stack
- community.docker.docker_stack:
- state: present
- name: monitoring
- compose:
- - /srv/monitoring/observability.yml
diff --git a/iac/ansible/roles/ufw_firewall/tasks/main.yml b/iac/ansible/roles/ufw_firewall/tasks/main.yml
index e992ebb..60616f4 100644
--- a/iac/ansible/roles/ufw_firewall/tasks/main.yml
+++ b/iac/ansible/roles/ufw_firewall/tasks/main.yml
@@ -59,6 +59,14 @@
interface: "{{ private_interface }}"
direction: in
+- name: FIREWALL | Allow the Ceph Prometheus exporter on the private interface
+ community.general.ufw:
+ rule: allow
+ port: "9283"
+ proto: tcp
+ interface: "{{ private_interface }}"
+ direction: in
+
- name: FIREWALL | Docker Swarm Management Ports auf privatem Interface erlauben
community.general.ufw:
rule: allow