add platform services

Marcel Arndt 2026-01-05 16:01:40 +01:00
parent e25d8dd5d9
commit 193319fa52
65 changed files with 4240 additions and 39 deletions

8
.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
.local/
leantime/
.vscode/
.DS_Store

35
Readme.md Normal file
View File

@ -0,0 +1,35 @@
██████╗ ███████╗███╗ ██╗██╗██╗ ██╗ ██████╗ ██████╗ ███████╗ ██████╗
██╔════╝ ██╔════╝████╗ ██║██║██║ ██║██╔════╝ ██╔════╝ ██╔════╝██╔═══██╗
██║ ███╗█████╗ ██╔██╗ ██║██║██║ ██║╚█████╗ ██║ █████╗ ██║ ██║
██║ ██║██╔══╝ ██║╚██╗██║██║██║ ██║ ╚═══██╗ ██║ ██╔══╝ ██║ ██║
╚██████╔╝███████╗██║ ╚████║██║╚█████╔╝██████╔╝ ╚██████╗ ███████╗╚██████╔╝
╚═════╝ ╚══════╝╚═╝ ╚═══╝╚═╝ ╚════╝ ╚═════╝ ██ ╚═════╝ ╚══════╝ ╚═════╝
---
# Genius.ceo repository
Ceph Dashboard is now available at:
URL: https://manager-node-1:8443/
User: admin
Password: g0uhtgv520
Enabling client.admin keyring and conf on hosts with "admin" label
Saving cluster configuration to /var/lib/ceph/6fb9d55b-5b20-11f0-9be6-920006043bcc/config directory
You can access the Ceph CLI as following in case of multi-cluster or non-default config:
sudo /usr/sbin/cephadm shell --fsid 6fb9d55b-5b20-11f0-9be6-920006043bcc -c /etc/ceph/ceph.conf -k /etc/ceph/ceph.client.admin.keyring
Or, if you are only running a single cluster on this host:
sudo /usr/sbin/cephadm shell
Please consider enabling telemetry to help improve Ceph:
ceph telemetry on
For more information see:
https://docs.ceph.com/en/latest/mgr/telemetry/
Bootstrap complete.
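A quick way to confirm the bootstrapped cluster is healthy is to query it through the cephadm shell shown above; a minimal sketch, assuming it is run as root on manager-node-1:

sudo /usr/sbin/cephadm shell -- ceph -s            # overall cluster health
sudo /usr/sbin/cephadm shell -- ceph orch host ls  # hosts known to the orchestrator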

View File

@ -0,0 +1,33 @@
- name: Initialize and harden nodes
hosts: all
become: true
roles:
- role: common
tags: common
- role: ssh_hardening
tags: ssh
- role: ufw_firewall
tags: firewall
- role: fail2ban
tags: fail2ban
handlers:
- name: restart sshd
ansible.builtin.service:
name: ssh
state: restarted
- name: restart fail2ban
ansible.builtin.service:
name: fail2ban
state: restarted
- name: Setup Ceph Cluster and CephFS
hosts: all
become: true
roles:
- role: ceph_setup
- name: Initialize Docker Swarm
hosts: all
become: true
roles:
- role: docker_swarm

View File

@ -0,0 +1,9 @@
- name: Deploy infrastructure services
hosts: all
gather_facts: true
roles:
- traefik
- authentik
- portainer
- leantime
- kestra

View File

@ -0,0 +1,4 @@
APT::Periodic::Update-Package-Lists "1";
APT::Periodic::Download-Upgradeable-Packages "1";
APT::Periodic::AutocleanInterval "7";
APT::Periodic::Unattended-Upgrade "1";
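To check that these periodic settings are in effect and that unattended-upgrades would act on them, the following commands can be used; a sketch, assuming the unattended-upgrades package is installed on the node:

apt-config dump | grep Periodic            # show the effective APT::Periodic settings
sudo unattended-upgrade --dry-run --debug  # simulate an unattended upgrade run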

View File

@ -0,0 +1,9 @@
Unattended-Upgrade::Allowed-Origins {
"${distro_id}:${distro_codename}-security";
};
Unattended-Upgrade::Package-Blacklist {
};
Unattended-Upgrade::DevRelease "false";
Unattended-Upgrade::Remove-Unused-Kernel-Packages "true";
Unattended-Upgrade::Remove-Unused-Dependencies "true";
Unattended-Upgrade::Automatic-Reboot "false";

View File

@ -0,0 +1,12 @@
admin_user: 'admin'
ssh_port: 22
cephfs_name: "shared-fs"
ceph_osd_device: "/dev/sdb"
public_interface: 'eth0'
private_interface: 'enp7s0'
authorized_keys:
- 'ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKtYTptTN9ggoy0aUKXmxaPKpclEn86jM7s5UtTw1JJI' # Marcel's MacBook
main_domain: genius.ceo
ceph_volume: /mnt/cephfs
traefik_public_net: traefik_public

View File

@ -0,0 +1,59 @@
$ANSIBLE_VAULT;1.1;AES256
34613362353964313436306439386661613364663666653265313937343239633365663836653030
6262386661666364383961336461316139333262623034340a643434316632336132613264646437
36376365613061353866383135353432303433353931633063313566613166303064316666613132
3733623536643935370a656431646435626265666265666230356162656363663838636662313466
61336237306332643032653766313036636163336431613236663864636438363832383231323362
62666463336639303766356331353635323031636465616235663738333761653934346663386636
63623361363164663663313966653939643462353638613464396466613931363662623763326535
34376237353663656636363866373466346434666339646131396439653261373738636665613435
65356330303863303236373933333163633964633061393136646632386137346434353365343763
30343937656166303962653030366566616331666262343336343138623566353832313836643435
62636333346235316562303061656166383135633464623734626336623565346336626134333933
35363363376663333061663164623539363731613263376163306436636265336562396439356137
30663431373131303437393166396539306133636264653733303762316363386438643536306338
32303139303363316264393939326561393730396664343361393863303736343933636265633439
65633765666362396439643863653531363366383866373939616333353430633530343262366138
31663863663165653932653733623761613265383039336633383832393761666337336165613933
63383934366662353038626539633132313939376231643133363739303235326433353733363437
35626233613936626532326262646166363739666162353237323237383132333134343439336134
33613462393237626432386462373439303439356666336630363536366233346438313039346530
33393232333633663731393466653439623638316565346530306439326431323436356166633334
66383034643834613133333265646338303463393035393266653832366434313636633730636436
38353337633437656262623061666563646637626363353561323231376237623264373861376666
66363265633638356133353933613664353934373634613662326437336562663766306364303538
35623130616265623838353838396235386661666132623163383162373665313462663738303933
63363764653561616162386139646130393439373066666437623236383238396233653165623032
34316439376331356539626464313462616238623166623761626435303565653233386236656262
62613935336661623862323833353265366533643830373634663266666332333463303666343366
39653332346433306566316430656361363230343761613263393230366362363132663565636264
65313633653464663963373561373532636235353331353237623635613034613337343730656632
31656165666134333864353730363163623365393030333932393565666235643639303662663532
38343734393135643039633664653966313536616533656635373535636434396333313536623536
39623132326362656166366566373163386363336231633233353639313166333932656133363365
66666665346331613638656562396463386637356539366539343232353061666531353166396536
39623762633064323332653831643832303332396431633738396266633935656132323164613161
61353663383532613763356630373063383161376165333736316466353231656534366636313636
37616636383163616136643630363535346137636636633432643337393865393063626663333164
36656537343231386333323637386539386364356266376433616636313239376666353066306363
39376461323062393935613630656230346131373634363633393035346263663762623063356633
36646664623230303761373138333164303363373365386266386138653764623030623630333631
66363866633064656532336137613964653431663436333761666631656339646161636435343065
37646164653937633962386631373236653064346438323664383933643738656536356562626532
34663834363230303164626236393938643037363036613965373330636238633661346335336531
62663461626365386362393061626266303463663735303539383937363965383234666337386165
30366564363766623162306666656566353662633866396430396633623266383332303339666663
38313536666336323366616432336161656434646463373963356331326364333038366337386638
39396535386331663466323334613533383439343437363631363532313362663564353635343735
37653063383163316366366335663537653134326564643062653065303337303333643961383837
39393734326562616165313133643766303934336263326433366436623539633233643761616436
33356234313538343635343630623337343436346638396539316131623861353630333964633839
33316565326164386337623730623932313363306436316335336238333430626165663232343463
36653038633632616335393262656638346434386639383131396233643932323931393264613134
30336134343464373265636234656561653462356435383138323638613039623839373935326462
32393430616438356332313766353337383035623137363233323664393833303464313162303833
65383131313335353832343963636639346162353634306430353638393136623734623833306136
32396130623065326636633235346630336435663261353866323862666231656261333839373162
35623835663434356438653533623337363531353634663064303035633839656463656238636132
66316333356633613130323438376530623634336632323365616239373865623334363635396331
3263616336653336636666386632316564613331323431363935

10
iac/ansible/inventory.ini Normal file
View File

@ -0,0 +1,10 @@
[all:children]
managers
workers
[managers]
manager-node-1 ansible_host=37.27.215.220 ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=./.local/secure/private_key
manager-node-2 ansible_host=135.181.146.55 ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=./.local/secure/private_key
manager-node-3 ansible_host=65.109.135.85 ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=./.local/secure/private_key
[workers]
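With this inventory in place, connectivity and the playbooks can be exercised from the repository root roughly as follows; a sketch, assuming the private key exists at ./.local/secure/private_key as referenced above:

ansible all -i iac/ansible/inventory.ini -m ping                        # verify SSH reachability of all nodes
ansible-playbook -i iac/ansible/inventory.ini iac/ansible/playbook.yml  # run the main playbook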

10
iac/ansible/playbook.yml Normal file
View File

@ -0,0 +1,10 @@
---
- name: Main-Playbook
hosts: all
gather_facts: true
roles:
# - traefik
# - portainer
# - kestra
- gitea

View File

@ -0,0 +1,33 @@
networks:
traefik_public:
external: true
services:
dockge:
image: louislam/dockge:1
environment:
- DOCKGE_STACKS_DIR=/opt/stacks
- DOCKGE_DATA_DIR=/app/data
volumes:
- /var/run/docker.sock:/var/run/docker.sock
- '/mnt/cephfs/dockge/data:/app/data'
- '/mnt/cephfs/dockge/stacks:/opt/stacks'
networks:
- traefik_public
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.role == manager
restart_policy:
condition: on-failure
labels:
- 'traefik.enable=true'
- 'traefik.swarm.network=traefik_public'
# --- Router for Dockge ---
- 'traefik.http.routers.dockge.rule=Host(`dockge.genius.ceo`)'
- 'traefik.http.routers.dockge.entrypoints=https'
- 'traefik.http.routers.dockge.tls.certresolver=main'
# --- Service for Dockge ---
- 'traefik.http.services.dockge.loadbalancer.server.port=5001'
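Deploying this file as a Swarm stack could look roughly like this; a sketch, assuming the compose file is saved as dockge.yml (hypothetical name) on a manager node and that the external traefik_public network has already been created by the Traefik role:

docker network inspect traefik_public    # must already exist (external: true)
docker stack deploy -c dockge.yml dockge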

View File

@ -0,0 +1,22 @@
services:
sd_server:
image: socheatsok78/dockerswarm_sd_server:latest
networks:
- sd_network
volumes:
- /var/run/docker.sock:/var/run/docker.sock
networks:
monitoring:
driver: overlay
attachable: true
ipam:
config:
- subnet: 172.16.201.0/24
sd_network:
driver: overlay
attachable: true
ipam:
config:
- subnet: 172.16.202.0/24

View File

@ -0,0 +1,139 @@
faro.receiver "stage_app_agent_receiver" {
server {
listen_address = "0.0.0.0"
listen_port = 12347
cors_allowed_origins = ["*"]
// cors_allowed_origins = ["https://avicenna.genius.ceo"]
api_key = "t3stK3y"
max_allowed_payload_size = "10MiB"
rate_limiting {
rate = 100
}
}
sourcemaps {}
output {
logs = [loki.process.logs_process_client.receiver]
traces = [otelcol.exporter.otlp.tempo.input]
}
}
loki.process "logs_process_client" {
forward_to = [loki.write.to_loki.receiver]
stage.logfmt {
mapping = { "kind" = "", "service_name" = "", "app_name" = "", "namespace" = "" }
}
stage.labels {
values = { "kind" = "kind", "service_name" = "service_name", "app" = "app_name", "namespace" = "namespace" }
}
}
otelcol.receiver.otlp "otel_collector" {
grpc {
endpoint = "0.0.0.0:4317"
}
http {
endpoint = "0.0.0.0:4318"
cors {
allowed_origins = ["https://avicenna.genius.ceo/"]
}
}
// Defines where the received data is forwarded
output {
metrics = [otelcol.exporter.prometheus.otel_metrics.input]
logs = [otelcol.exporter.loki.otel_logs.input]
traces = [otelcol.exporter.otlp.tempo.input]
}
}
loki.write "to_loki" {
endpoint {
url = "http://loki:3100/loki/api/v1/push"
}
}
prometheus.remote_write "to_prometheus" {
endpoint {
url = "http://prometheus:9090/api/v1/write"
}
}
// Discover Docker containers on the host
discovery.docker "logs_integration_docker" {
host = "unix:///var/run/docker.sock"
refresh_interval = "5s"
}
discovery.relabel "logs_integration_docker" {
targets = []
rule {
action = "labelmap"
regex = "__meta_docker_container_label_com_docker_swarm_node_id"
replacement = "node_id"
}
rule {
action = "labelmap"
regex = "__meta_docker_container_label_com_docker_stack_namespace"
replacement = "namespace"
}
rule {
action = "labelmap"
regex = "__meta_docker_container_label_com_docker_swarm_service_name"
replacement = "service_name"
}
rule {
action = "labelmap"
regex = "__meta_docker_container_name"
replacement = "container_name"
}
}
loki.source.docker "logs_from_containers" {
host = "unix:///var/run/docker.sock"
targets = discovery.docker.logs_integration_docker.targets // Uses the discovered containers
relabel_rules = discovery.relabel.logs_integration_docker.rules
// Forwards the collected logs to the configured Loki endpoint
forward_to = [loki.write.to_loki.receiver]
}
otelcol.exporter.otlp "tempo" { // Name kann variieren
client {
endpoint = "tempo:4317" // Ziel: Tempo Service auf Port 4317
tls {
insecure = true // Internal communication without TLS
}
}
}
otelcol.exporter.prometheus "otel_metrics" {
forward_to = [prometheus.remote_write.to_prometheus.receiver]
}
otelcol.exporter.loki "otel_logs" {
forward_to = [loki.write.to_loki.receiver]
}
// Configure logging for Alloy itself
logging {
level = "info"
format = "logfmt"
}
// prometheus.scrape "alloy_self" {
// targets = [
// prometheus.target_group {
// targets = [{"__address__" = "localhost:12345"}]
// }
// ]
// forward_to = [...] // To Prometheus remote write or a local agent
// }
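To smoke-test the OTLP HTTP receiver defined above, an empty but valid OTLP payload can be posted to port 4318; a minimal sketch, assuming it runs in a container attached to the same overlay network, where the service is reachable under the name alloy:

curl -s -X POST http://alloy:4318/v1/logs \
  -H 'Content-Type: application/json' \
  -d '{"resourceLogs":[]}'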

View File

@ -0,0 +1,76 @@
auth_enabled: false # Simplest configuration, no authentication
analytics:
reporting_enabled: false
server:
http_listen_port: 3100
grpc_listen_port: 9096 # Default gRPC port for Loki
common:
instance_addr: 127.0.0.1 # Address the instance advertises itself under
path_prefix: /loki # Where Loki stores its data (inside the volume)
storage:
filesystem: # Local filesystem for indexes and chunks
chunks_directory: /loki/chunks
rules_directory: /loki/rules
replication_factor: 1 # No replication for a single instance
ring:
kvstore:
store: inmemory # Simplest ring store for a single instance
query_range:
results_cache:
cache:
embedded_cache:
enabled: true
max_size_mb: 100
schema_config:
configs:
- from: 2020-10-24
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: index_
period: 24h
pattern_ingester:
enabled: true
metric_aggregation:
loki_address: localhost:3100
frontend:
encoding: protobuf
limits_config:
metric_aggregation_enabled: true
reject_old_samples: true
reject_old_samples_max_age: 168h # 7 days
ingestion_rate_mb: 15 # Allow 15 MiB/s per tenant (default was 4)
ingestion_burst_size_mb: 30 # Allow short bursts up to 30 MiB (default was 6)
# Optional: maximum number of active log streams per tenant (default is 10000)
# max_global_streams_per_user: 10000
# Optional: maximum size of a single log line (default 256 kB)
# max_line_size: 262144
# --- Optional: compactor (cleans up old data) ---
# compactor:
# working_directory: /loki/compactor
# shared_store: filesystem
# compaction_interval: 10m
# retention_enabled: true
# retention_delete_delay: 2h
# retention_delete_worker_count: 150
# --- Optional: ruler (for alerts based on logs) ---
# ruler:
# alertmanager_url: http://alertmanager:9093 # URL of your Alertmanager
# storage:
# type: local
# local:
# directory: /loki/rules
# rule_path: /tmp/loki/rules-temp
# ring:
# kvstore:
# store: inmemory
# enable_api: true
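A quick readiness check against this single-instance Loki; a sketch, assuming it runs from a container on the same network where the service is reachable as loki (the name used by the push URLs elsewhere in this commit):

curl -s http://loki:3100/ready           # returns "ready" once startup has finished
curl -s http://loki:3100/metrics | head  # internal Prometheus metrics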

View File

@ -0,0 +1,57 @@
global:
scrape_interval: 15s # How often targets are scraped
evaluation_interval: 15s # How often rules are evaluated
scrape_configs:
- job_name: 'prometheus'
# Prometheus monitors itself
static_configs:
- targets: ['localhost:9090']
- job_name: 'node-exporter'
# Docker Swarm service discovery for the node exporter
dockerswarm_sd_configs:
- host: unix:///var/run/docker.sock
role: tasks
port: 9100 # Default node exporter port
relabel_configs:
# Only keep tasks in the 'running' state
- source_labels: [__meta_dockerswarm_task_desired_state]
regex: running
action: keep
# Only select tasks of the 'node-exporter' service from this stack
# Adjust the regex if your stack is named differently (assumption here: the stack name contains 'monitoring')
- source_labels: [__meta_dockerswarm_service_name]
regex: ^monitoring_node-exporter$ # Adjust the regex to your stack name!
action: keep
# Use the Swarm node's hostname as the instance label
- source_labels: [__meta_dockerswarm_node_hostname]
target_label: instance
# Set the target address to the correct IP:port
- source_labels: [__address__]
regex: '(.*):.*' # Extract the IP address
replacement: '${1}:9100' # Set the correct port (9100)
target_label: __address__
- job_name: 'cadvisor'
dockerswarm_sd_configs:
- host: unix:///var/run/docker.sock
role: tasks
port: 8080 # Default cAdvisor port
relabel_configs:
# Only keep tasks in the 'running' state
- source_labels: [__meta_dockerswarm_task_desired_state]
regex: running
action: keep
# Only select tasks of the 'cadvisor' service from this stack
# Adjust the regex to your stack name!
- source_labels: [__meta_dockerswarm_service_name]
regex: .*(monitoring|mon)_cadvisor.* # Adjust the regex to your stack name!
action: keep
# Use the Swarm node's hostname as the instance label
- source_labels: [__meta_dockerswarm_node_hostname]
target_label: instance
# IMPORTANT: set the metrics path, since cAdvisor serves it under /metrics
- action: replace
target_label: __metrics_path__
replacement: /metrics
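Two ways to sanity-check this scrape configuration; a sketch, assuming promtool is available and the file is stored at the path used by the Swarm stack further down in this commit:

promtool check config /srv/monitoring/config/prometheus.v3.yml  # validate the configuration
curl -s http://prometheus:9090/api/v1/targets                   # list discovered targets (from inside the overlay network)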

View File

@ -0,0 +1,38 @@
server:
http_listen_port: 9080
grpc_listen_port: 0
positions:
filename: /mnt/promtail/positions.yaml # Path inside the mounted volume
clients:
- url: http://loki:3100/loki/api/v1/push # Sends logs to the Loki service
scrape_configs:
- job_name: docker_containers
docker_sd_configs:
- host: unix:///var/run/docker.sock
refresh_interval: 5s
relabel_configs:
# Extract the container name (without the leading '/')
- source_labels: ['__meta_docker_container_name']
regex: '/(.*)'
target_label: 'container_name'
# Keep the log stream (stdout/stderr) as a label
- source_labels: ['__meta_docker_container_log_stream']
target_label: 'logstream'
# Extract the service name from the Swarm label
- source_labels: ['__meta_docker_container_label_com_docker_swarm_service_name']
target_label: 'service_name'
# Extract the task name from the Swarm label
- source_labels: ['__meta_docker_container_label_com_docker_swarm_task_name']
target_label: 'task_name'
# Add an 'instance' label with the task's hostname (an approximation of the node name)
- action: replace
source_labels: ['container_name'] # Needs an existing label as the source
target_label: 'instance'
replacement: ${HOSTNAME} # Uses the Swarm HOSTNAME variable
# Drop logs from Promtail itself (adjust the regex to your stack name if needed)
- source_labels: ['container_name']
regex: 'monitoring_promtail.*' # Adjust 'monitoring' to your stack name!
action: drop

View File

@ -0,0 +1,36 @@
server:
http_listen_port: 3200 # Default API/UI port
distributor:
receivers: # Enable the OTLP receiver (Tempo can also receive directly)
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
# Basic data-processing configuration (usually fine to start with)
ingester:
trace_idle_period: 10s
max_block_bytes: 1048576 # 1MB
max_block_duration: 5m
compactor:
compaction:
block_retention: 1h # Minimum time to keep blocks (low value for testing)
# IMPORTANT: define storage explicitly!
storage:
trace:
backend: local # Backend type: local filesystem
# Write Ahead Log (WAL) configuration.
wal:
path: /tmp/tempo/wal # Directory to store the WAL locally.
# Local configuration for filesystem storage.
local:
path: /tmp/tempo/blocks # Directory to store the TSDB blocks.
# Pool used for finding trace IDs.
pool:
max_workers: 100 # Worker pool determines the number of parallel requests to the object store backend.
queue_depth: 10000 # Maximum depth for the querier queue jobs. A job is required for each block searched.
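Tempo's readiness can be verified on the API/UI port configured above; a sketch, run from a container on the same network where the service is reachable as tempo:

curl -s http://tempo:3200/ready           # readiness of the Tempo instance
curl -s http://tempo:3200/metrics | head  # internal metrics on the same port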

View File

@ -0,0 +1,226 @@
configs:
alloy-config-v3:
file: /srv/monitoring/config/alloy.v3.alloy
loki-config-v1:
file: /srv/monitoring/config/loki.v1.yml
prometheus-config-v3:
file: /srv/monitoring/config/prometheus.v3.yml
tempo-config-v1:
file: /srv/monitoring/config/tempo.v1.yml
volumes:
prometheus-data:
driver: local
grafana-data:
driver: local
loki-data:
driver: local
alloy-data:
driver: local
tempo-data:
driver: local
networks:
monitoring-net: # Internal overlay network for the monitoring components
driver: overlay
attachable: true # Allows other containers/stacks to attach if needed
traefik_public: # The external network that Traefik listens on
external: true # Important: this network is NOT created by this stack
services:
prometheus:
image: prom/prometheus:latest
user: "65534:988"
volumes:
- prometheus-data:/prometheus
- /var/run/docker.sock:/var/run/docker.sock:ro
configs:
- source: prometheus-config-v3 # Versioned config
target: /etc/prometheus/prometheus.yml
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
- '--web.console.templates=/usr/share/prometheus/consoles'
- '--web.enable-lifecycle'
- '--web.enable-remote-write-receiver'
networks:
- monitoring-net
- traefik_public # Only if Traefik should reach Prometheus directly (optional)
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.role == manager # Optional: pin to manager nodes
labels:
- "traefik.enable=true"
# --- Router for the Prometheus UI ---
- "traefik.http.routers.prometheus.rule=Host(`prometheus.genius.ceo`)"
- "traefik.http.routers.prometheus.entrypoints=https" # Adjust the entrypoint if yours differs
- "traefik.http.routers.prometheus.tls.certresolver=main" # Adjust the cert resolver!
# --- Service for the Prometheus UI ---
- "traefik.http.services.prometheus.loadbalancer.server.port=9090"
# --- Middleware (optional, e.g. for authentication) ---
# - "traefik.http.routers.prometheus.middlewares=my-auth-middleware"
# --- Network for Traefik ---
# IMPORTANT: the network must exist and Traefik must listen on it.
- "traefik.swarm.network=traefik_public" # Adjust the Traefik network name!
loki:
image: grafana/loki:latest
volumes:
- loki-data:/loki
configs:
- source: loki-config-v1
target: /etc/loki/local-config.yaml
command: "-config.file=/etc/loki/local-config.yaml"
networks:
- monitoring-net
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.role == manager
tempo:
image: grafana/tempo:latest # Current Tempo image
volumes:
- tempo-data:/tmp/tempo # Persistent storage for traces (default path)
configs:
- source: tempo-config-v1
target: /etc/tempo/tempo.yaml
command: [ "-config.file=/etc/tempo/tempo.yaml" ]
user: root
# Tempo listens internally on several ports for different protocols:
# - 4317 (OTLP gRPC - used by Alloy)
# - 4318 (OTLP HTTP)
# - 14268 (Jaeger gRPC)
# - 3200 (Tempo HTTP frontend/API - for Grafana & the UI)
# We do not expose them externally for now.
networks:
- monitoring-net
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.role == manager # Optional: pin to manager nodes
grafana:
image: grafana/grafana:latest
volumes:
- grafana-data:/var/lib/grafana
environment:
- GF_SECURITY_ADMIN_USER=admin
- GF_SECURITY_ADMIN_PASSWORD=admin # Better handled via Docker secrets!
# Further Grafana env vars as needed
networks:
- monitoring-net
- traefik_public # Only if Traefik should reach Grafana directly (optional)
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.role == manager # Optional: pin to manager nodes
labels:
- "traefik.enable=true"
# --- Router for Grafana ---
- "traefik.http.routers.grafana.rule=Host(`grafana.genius.ceo`)"
- "traefik.http.routers.grafana.entrypoints=https" # Adjust the entrypoint if yours differs
- "traefik.http.routers.grafana.tls.certresolver=main" # Adjust the cert resolver!
# --- Service for Grafana ---
- "traefik.http.services.grafana.loadbalancer.server.port=3000"
# --- Middleware (optional) ---
# - "traefik.http.routers.grafana.middlewares=my-auth-middleware"
# --- Network for Traefik ---
- "traefik.swarm.network=traefik_public" # Adjust the Traefik network name!
alloy:
image: grafana/alloy:latest # Official Alloy image
volumes:
- alloy-data:/var/lib/alloy/data # Persistent storage for Alloy (WAL etc.)
- /var/run/docker.sock:/var/run/docker.sock:ro # For Docker discovery
configs:
- source: alloy-config-v3
target: /etc/alloy/config.alloy # Target path for the Alloy config
environment:
- HOSTNAME=${HOSTNAME}
# Start as root because of Docker socket / volume permissions; can be optimized later (socket proxy)
# user: root
command: [
"run",
"--server.http.listen-addr=0.0.0.0:12345",
"/etc/alloy/config.alloy",
]
networks:
- monitoring-net
- traefik_public
deploy:
mode: global # IMPORTANT: Alloy must run on every node!
labels: # Traefik labels for the Alloy UI
- "traefik.enable=true"
# --- Router for the Alloy UI ---
- "traefik.http.routers.alloy-ui.rule=Host(`otlp.genius.ceo`)"
- "traefik.http.routers.alloy-ui.entrypoints=https"
- "traefik.http.routers.alloy-ui.tls.certresolver=main"
- "traefik.http.routers.alloy-ui.service=alloy-ui@swarm"
# --- Service for the Alloy UI ---
- "traefik.http.services.alloy-ui.loadbalancer.server.port=12345" # Target port is 12345 (Alloy UI default)
# # --- Router for OTLP HTTP ---
# - "traefik.http.routers.otlp-http.rule=Host(`alloy.genius.ceo`)"
# - "traefik.http.routers.otlp-http.entrypoints=https"
# - "traefik.http.routers.otlp-http.tls.certresolver=main"
# - "traefik.http.routers.otlp-http.service=otlp-http@swarm"
# # --- Service for OTLP HTTP ---
# - "traefik.http.services.otlp-http.loadbalancer.server.port=4318" # Target port is 4318 (OTLP HTTP default)
# --- Router for the FARO RECEIVER ---
- "traefik.http.routers.faro-receiver.rule=Host(`alloy.genius.ceo`)"
- "traefik.http.routers.faro-receiver.entrypoints=https"
- "traefik.http.routers.faro-receiver.tls.certresolver=main"
- "traefik.http.routers.faro-receiver.service=faro-receiver@swarm"
# --- Service for the FARO RECEIVER ---
- "traefik.http.services.faro-receiver.loadbalancer.server.port=12347" # Target port is 12347 (Faro receiver default)
# # --- Middlewares ---
# - "traefik.http.routers.otlp-http.middlewares=alloy-ratelimit@swarm"
# - "traefik.http.middlewares.alloy-ratelimit.ratelimit.average=100" # z.B. 100 Anfragen pro Sekunde
# - "traefik.http.middlewares.alloy-ratelimit.ratelimit.burst=50" # kurzfristig 50 mehr erlaubt
# --- Netzwerk für Traefik ---
- "traefik.swarm.network=traefik_public" # Traefik Netzwerkname prüfen/anpassen!
node-exporter:
image: quay.io/prometheus/node-exporter:latest # Use a current image
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- '--path.procfs=/host/proc'
- '--path.sysfs=/host/sys'
- '--path.rootfs=/rootfs'
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)'
networks:
- monitoring-net # Only the internal network is needed
deploy:
mode: global # Runs on EVERY node in the Swarm
cadvisor:
image: gcr.io/cadvisor/cadvisor:latest # Google's cAdvisor Image
volumes:
# cAdvisor needs access to host system info and Docker
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
- /var/run/docker.sock:/var/run/docker.sock:ro
networks:
- monitoring-net # Only the internal network is needed
deploy:
mode: global # Runs on EVERY node in the Swarm
resources: # Optional: limits resources, cAdvisor can be hungry
limits:
memory: 512M
reservations:
memory: 256M
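Deploying the whole monitoring stack could then look like this; a sketch, assuming the stack file is saved as monitoring.yml (hypothetical name), the referenced config files exist under /srv/monitoring/config/ on the manager where the command runs, and the stack is named monitoring so that the Prometheus relabel regexes above match:

ls /srv/monitoring/config/        # alloy.v3.alloy, loki.v1.yml, prometheus.v3.yml, tempo.v1.yml
docker stack deploy -c monitoring.yml monitoring
docker stack services monitoring  # check that all services reach their desired replica count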

View File

@ -0,0 +1,102 @@
receivers:
hostmetrics:
collection_interval: 30s
root_path: /hostfs
scrapers:
cpu: {}
load: {}
memory: {}
disk: {}
filesystem: {}
network: {}
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
prometheus:
config:
global:
scrape_interval: 60s
scrape_configs:
- job_name: otel-agent
static_configs:
- targets:
- localhost:8888
labels:
job_name: otel-agent
tcplog/docker:
listen_address: "0.0.0.0:2255"
operators:
- type: regex_parser
regex: '^<([0-9]+)>[0-9]+ (?P<timestamp>[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+)?([zZ]|([\+-])([01]\d|2[0-3]):?([0-5]\d)?)?) (?P<container_id>\S+) (?P<container_name>\S+) [0-9]+ - -( (?P<body>.*))?'
timestamp:
parse_from: attributes.timestamp
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
- type: move
from: attributes["body"]
to: body
- type: remove
field: attributes.timestamp
# please remove names from below if you want to collect logs from them
- type: filter
id: signoz_logs_filter
expr: 'attributes.container_name matches "^(signoz_(logspout|signoz|otel-collector|clickhouse|zookeeper))|(infra_(logspout|otel-agent|otel-metrics)).*"'
processors:
batch:
send_batch_size: 10000
send_batch_max_size: 11000
timeout: 10s
resourcedetection:
# Using OTEL_RESOURCE_ATTRIBUTES envvar, env detector adds custom labels.
detectors:
# - ec2
# - gcp
# - azure
- env
- system
timeout: 2s
extensions:
health_check:
endpoint: 0.0.0.0:13133
pprof:
endpoint: 0.0.0.0:1777
exporters:
otlp:
endpoint: ${env:SIGNOZ_COLLECTOR_ENDPOINT}
tls:
insecure: true
headers:
signoz-access-token: ${env:SIGNOZ_ACCESS_TOKEN}
# debug: {}
service:
telemetry:
logs:
encoding: json
metrics:
address: 0.0.0.0:8888
extensions:
- health_check
- pprof
pipelines:
traces:
receivers: [otlp]
processors: [resourcedetection, batch]
exporters: [otlp]
metrics:
receivers: [otlp]
processors: [resourcedetection, batch]
exporters: [otlp]
metrics/hostmetrics:
receivers: [hostmetrics]
processors: [resourcedetection, batch]
exporters: [otlp]
metrics/prometheus:
receivers: [prometheus]
processors: [resourcedetection, batch]
exporters: [otlp]
logs:
receivers: [otlp, tcplog/docker]
processors: [resourcedetection, batch]
exporters: [otlp]

View File

@ -0,0 +1,103 @@
receivers:
prometheus:
config:
global:
scrape_interval: 60s
scrape_configs:
- job_name: otel-metrics
static_configs:
- targets:
- localhost:8888
labels:
job_name: otel-metrics
# For Docker daemon metrics to be scraped, it must be configured to expose
# Prometheus metrics, as documented here: https://docs.docker.com/config/daemon/prometheus/
# - job_name: docker-daemon
# dockerswarm_sd_configs:
# - host: unix:///var/run/docker.sock
# role: nodes
# relabel_configs:
# - source_labels: [__meta_dockerswarm_node_address]
# target_label: __address__
# replacement: $1:9323
- job_name: "dockerswarm"
dockerswarm_sd_configs:
- host: unix:///var/run/docker.sock
role: tasks
relabel_configs:
- action: keep
regex: running
source_labels:
- __meta_dockerswarm_task_desired_state
- action: keep
regex: true
source_labels:
- __meta_dockerswarm_service_label_signoz_io_scrape
- regex: ([^:]+)(?::\d+)?
replacement: $1
source_labels:
- __address__
target_label: swarm_container_ip
- separator: .
source_labels:
- __meta_dockerswarm_service_name
- __meta_dockerswarm_task_slot
- __meta_dockerswarm_task_id
target_label: swarm_container_name
- target_label: __address__
source_labels:
- swarm_container_ip
- __meta_dockerswarm_service_label_signoz_io_port
separator: ":"
- source_labels:
- __meta_dockerswarm_service_label_signoz_io_path
target_label: __metrics_path__
- source_labels:
- __meta_dockerswarm_service_label_com_docker_stack_namespace
target_label: namespace
- source_labels:
- __meta_dockerswarm_service_name
target_label: service_name
- source_labels:
- __meta_dockerswarm_task_id
target_label: service_instance_id
- source_labels:
- __meta_dockerswarm_node_hostname
target_label: host_name
processors:
batch:
send_batch_size: 10000
send_batch_max_size: 11000
timeout: 10s
resourcedetection:
detectors:
- env
- system
timeout: 2s
extensions:
health_check:
endpoint: 0.0.0.0:13133
pprof:
endpoint: 0.0.0.0:1777
exporters:
otlp:
endpoint: ${env:SIGNOZ_COLLECTOR_ENDPOINT}
tls:
insecure: true
headers:
signoz-access-token: ${env:SIGNOZ_ACCESS_TOKEN}
# debug: {}
service:
telemetry:
logs:
encoding: json
metrics:
address: 0.0.0.0:8888
extensions:
- health_check
- pprof
pipelines:
metrics:
receivers: [prometheus]
processors: [resourcedetection, batch]
exporters: [otlp]

View File

@ -0,0 +1,78 @@
version: "3"
x-common: &common
networks:
- signoz-net
extra_hosts:
- host.docker.internal:host-gateway
logging:
options:
max-size: 50m
max-file: "3"
deploy:
mode: global
restart_policy:
condition: on-failure
services:
otel-agent:
<<: *common
image: otel/opentelemetry-collector-contrib:0.111.0
command:
- --config=/etc/otel-collector-config.yaml
configs:
- source: otel-agent-config-v1
target: /etc/otel-collector-config.yaml
volumes:
- /:/hostfs:ro
environment:
- SIGNOZ_COLLECTOR_ENDPOINT=http://host.docker.internal:4317 # In case of external SigNoz or cloud, update the endpoint and access token
- OTEL_RESOURCE_ATTRIBUTES=host.name={{.Node.Hostname}},os.type={{.Node.Platform.OS}}
# - SIGNOZ_ACCESS_TOKEN="<your-access-token>"
# Before exposing the ports, make sure the ports are not used by other services
# ports:
# - "4317:4317"
# - "4318:4318"
otel-metrics:
<<: *common
image: otel/opentelemetry-collector-contrib:0.111.0
user: 0:0 # If you have security concerns, you can replace this with your `UID:GID` that has necessary permissions to docker.sock
command:
- --config=/etc/otel-collector-config.yaml
configs:
- source: otel-metrics-config-v1
target: /etc/otel-collector-config.yaml
volumes:
- /var/run/docker.sock:/var/run/docker.sock
environment:
- SIGNOZ_COLLECTOR_ENDPOINT=http://host.docker.internal:4317 # In case of external SigNoz or cloud, update the endpoint and access token
- OTEL_RESOURCE_ATTRIBUTES=host.name={{.Node.Hostname}},os.type={{.Node.Platform.OS}}
# - SIGNOZ_ACCESS_TOKEN="<your-access-token>"
# Before exposing the ports, make sure the ports are not used by other services
# ports:
# - "4317:4317"
# - "4318:4318"
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.role == manager
logspout:
<<: *common
image: "gliderlabs/logspout:v3.2.14"
command: syslog+tcp://otel-agent:2255
user: root
volumes:
- /etc/hostname:/etc/host_hostname:ro
- /var/run/docker.sock:/var/run/docker.sock
depends_on:
- otel-agent
networks:
signoz-net:
name: signoz-net
external: true
configs:
otel-metrics-config-v1:
file: /mnt/cephfs/signoz-infra/config/otel-metrics-config.v1.yaml
otel-agent-config-v1:
file: /mnt/cephfs/signoz-infra/config/otel-agent-config.v1.yaml
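Because signoz-net is marked external here, the SigNoz stack later in this commit (which creates that network) has to be deployed first; a sketch, assuming this file is saved as signoz-infra.yml (hypothetical name) and the stack is named infra so that the otel-agent filter expression above matches:

docker network inspect signoz-net              # created by the signoz stack, must exist first
docker stack deploy -c signoz-infra.yml infra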

View File

@ -0,0 +1,75 @@
<?xml version="1.0"?>
<clickhouse>
<!-- ZooKeeper is used to store metadata about replicas, when using Replicated tables.
Optional. If you don't use replicated tables, you could omit that.
See https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/replication/
-->
<zookeeper>
<node index="1">
<host>zookeeper-1</host>
<port>2181</port>
</node>
<!-- <node index="2">
<host>zookeeper-2</host>
<port>2181</port>
</node>
<node index="3">
<host>zookeeper-3</host>
<port>2181</port>
</node> -->
</zookeeper>
<!-- Configuration of clusters that could be used in Distributed tables.
https://clickhouse.com/docs/en/operations/table_engines/distributed/
-->
<remote_servers>
<cluster>
<!-- Inter-server per-cluster secret for Distributed queries
default: no secret (no authentication will be performed)
If set, then Distributed queries will be validated on shards, so at least:
- such cluster should exist on the shard,
- such cluster should have the same secret.
And also (and which is more important), the initial_user will
be used as current user for the query.
Right now the protocol is pretty simple and it only takes into account:
- cluster name
- query
Also it will be nice if the following will be implemented:
- source hostname (see interserver_http_host), but then it will depend on DNS;
it can use the IP address instead, but then you need to get it right on the initiator node.
- target hostname / ip address (same notes as for source hostname)
- time-based security tokens
-->
<!-- <secret></secret> -->
<shard>
<!-- Optional. Whether to write data to just one of the replicas. Default: false (write data to all replicas). -->
<!-- <internal_replication>false</internal_replication> -->
<!-- Optional. Shard weight when writing data. Default: 1. -->
<!-- <weight>1</weight> -->
<replica>
<host>clickhouse</host>
<port>9000</port>
<!-- Optional. Priority of the replica for load_balancing. Default: 1 (less value has more priority). -->
<!-- <priority>1</priority> -->
</replica>
</shard>
<!-- <shard>
<replica>
<host>clickhouse-2</host>
<port>9000</port>
</replica>
</shard>
<shard>
<replica>
<host>clickhouse-3</host>
<port>9000</port>
</replica>
</shard> -->
</cluster>
</remote_servers>
</clickhouse>

File diff suppressed because it is too large

View File

@ -0,0 +1,21 @@
<functions>
<function>
<type>executable</type>
<name>histogramQuantile</name>
<return_type>Float64</return_type>
<argument>
<type>Array(Float64)</type>
<name>buckets</name>
</argument>
<argument>
<type>Array(Float64)</type>
<name>counts</name>
</argument>
<argument>
<type>Float64</type>
<name>quantile</name>
</argument>
<format>CSV</format>
<command>./histogramQuantile</command>
</function>
</functions>

View File

@ -0,0 +1,41 @@
<?xml version="1.0"?>
<clickhouse>
<storage_configuration>
<disks>
<default>
<keep_free_space_bytes>10485760</keep_free_space_bytes>
</default>
<s3>
<type>s3</type>
<!-- For S3 cold storage,
if region is us-east-1, endpoint can be https://<bucket-name>.s3.amazonaws.com
if region is not us-east-1, endpoint should be https://<bucket-name>.s3-<region>.amazonaws.com
For GCS cold storage,
endpoint should be https://storage.googleapis.com/<bucket-name>/data/
-->
<endpoint>https://BUCKET-NAME.s3-REGION-NAME.amazonaws.com/data/</endpoint>
<access_key_id>ACCESS-KEY-ID</access_key_id>
<secret_access_key>SECRET-ACCESS-KEY</secret_access_key>
<!-- In case of S3, uncomment the below configuration in case you want to read
AWS credentials from the Environment variables if they exist. -->
<!-- <use_environment_credentials>true</use_environment_credentials> -->
<!-- In case of GCS, uncomment the below configuration, since GCS does
not support batch deletion and result in error messages in logs. -->
<!-- <support_batch_delete>false</support_batch_delete> -->
</s3>
</disks>
<policies>
<tiered>
<volumes>
<default>
<disk>default</disk>
</default>
<s3>
<disk>s3</disk>
<perform_ttl_move_on_insert>0</perform_ttl_move_on_insert>
</s3>
</volumes>
</tiered>
</policies>
</storage_configuration>
</clickhouse>

View File

@ -0,0 +1,123 @@
<?xml version="1.0"?>
<clickhouse>
<!-- See also the files in users.d directory where the settings can be overridden. -->
<!-- Profiles of settings. -->
<profiles>
<!-- Default settings. -->
<default>
<!-- Maximum memory usage for processing single query, in bytes. -->
<max_memory_usage>10000000000</max_memory_usage>
<!-- How to choose between replicas during distributed query processing.
random - choose random replica from set of replicas with minimum number of errors
nearest_hostname - from set of replicas with minimum number of errors, choose replica
with minimum number of different symbols between replica's hostname and local hostname
(Hamming distance).
in_order - first live replica is chosen in specified order.
first_or_random - if first replica one has higher number of errors, pick a random one from replicas with minimum number of errors.
-->
<load_balancing>random</load_balancing>
</default>
<!-- Profile that allows only read queries. -->
<readonly>
<readonly>1</readonly>
</readonly>
</profiles>
<!-- Users and ACL. -->
<users>
<!-- If user name was not specified, 'default' user is used. -->
<default>
<!-- See also the files in users.d directory where the password can be overridden.
Password could be specified in plaintext or in SHA256 (in hex format).
If you want to specify password in plaintext (not recommended), place it in 'password' element.
Example: <password>qwerty</password>.
Password could be empty.
If you want to specify SHA256, place it in 'password_sha256_hex' element.
Example: <password_sha256_hex>65e84be33532fb784c48129675f9eff3a682b27168c0ea744b2cf58ee02337c5</password_sha256_hex>
Restrictions of SHA256: impossibility to connect to ClickHouse using MySQL JS client (as of July 2019).
If you want to specify double SHA1, place it in 'password_double_sha1_hex' element.
Example: <password_double_sha1_hex>e395796d6546b1b65db9d665cd43f0e858dd4303</password_double_sha1_hex>
If you want to specify a previously defined LDAP server (see 'ldap_servers' in the main config) for authentication,
place its name in 'server' element inside 'ldap' element.
Example: <ldap><server>my_ldap_server</server></ldap>
If you want to authenticate the user via Kerberos (assuming Kerberos is enabled, see 'kerberos' in the main config),
place 'kerberos' element instead of 'password' (and similar) elements.
The name part of the canonical principal name of the initiator must match the user name for authentication to succeed.
You can also place 'realm' element inside 'kerberos' element to further restrict authentication to only those requests
whose initiator's realm matches it.
Example: <kerberos />
Example: <kerberos><realm>EXAMPLE.COM</realm></kerberos>
How to generate decent password:
Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-'
In first line will be password and in second - corresponding SHA256.
How to generate double SHA1:
Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-'
In first line will be password and in second - corresponding double SHA1.
-->
<password></password>
<!-- List of networks with open access.
To open access from everywhere, specify:
<ip>::/0</ip>
To open access only from localhost, specify:
<ip>::1</ip>
<ip>127.0.0.1</ip>
Each element of list has one of the following forms:
<ip> IP-address or network mask. Examples: 213.180.204.3 or 10.0.0.1/8 or 10.0.0.1/255.255.255.0
2a02:6b8::3 or 2a02:6b8::3/64 or 2a02:6b8::3/ffff:ffff:ffff:ffff::.
<host> Hostname. Example: server01.clickhouse.com.
To check access, DNS query is performed, and all received addresses compared to peer address.
<host_regexp> Regular expression for host names. Example, ^server\d\d-\d\d-\d\.clickhouse\.com$
To check access, DNS PTR query is performed for peer address and then regexp is applied.
Then, for result of PTR query, another DNS query is performed and all received addresses compared to peer address.
It is strongly recommended that the regexp ends with $
All results of DNS requests are cached till server restart.
-->
<networks>
<ip>::/0</ip>
</networks>
<!-- Settings profile for user. -->
<profile>default</profile>
<!-- Quota for user. -->
<quota>default</quota>
<!-- User can create other users and grant rights to them. -->
<!-- <access_management>1</access_management> -->
</default>
</users>
<!-- Quotas. -->
<quotas>
<!-- Name of quota. -->
<default>
<!-- Limits for time interval. You could specify many intervals with different limits. -->
<interval>
<!-- Length of interval. -->
<duration>3600</duration>
<!-- No limits. Just calculate resource usage for time interval. -->
<queries>0</queries>
<errors>0</errors>
<result_rows>0</result_rows>
<read_rows>0</read_rows>
<execution_time>0</execution_time>
</interval>
</default>
</quotas>
</clickhouse>

View File

@ -0,0 +1,140 @@
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
cors:
allowed_origins:
- https://*.genius.ceo
- https://*.avicenna.hamburg
prometheus:
config:
global:
scrape_interval: 60s
scrape_configs:
- job_name: otel-collector
static_configs:
- targets:
- localhost:8888
labels:
job_name: otel-collector
docker_stats:
endpoint: unix:///var/run/docker.sock
metrics:
container.cpu.utilization:
enabled: true
container.memory.percent:
enabled: true
container.network.io.usage.rx_bytes:
enabled: true
container.network.io.usage.tx_bytes:
enabled: true
container.network.io.usage.rx_dropped:
enabled: true
container.network.io.usage.tx_dropped:
enabled: true
container.memory.usage.limit:
enabled: true
container.memory.usage.total:
enabled: true
container.blockio.io_service_bytes_recursive:
enabled: true
processors:
batch:
send_batch_size: 10000
send_batch_max_size: 11000
timeout: 10s
resourcedetection:
# Using OTEL_RESOURCE_ATTRIBUTES envvar, env detector adds custom labels.
detectors: [env, system]
timeout: 2s
resourcedetection/docker:
detectors: [env, docker]
timeout: 2s
override: false
signozspanmetrics/delta:
metrics_exporter: clickhousemetricswrite, signozclickhousemetrics
metrics_flush_interval: 60s
latency_histogram_buckets: [100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1000ms, 1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s ]
dimensions_cache_size: 100000
aggregation_temporality: AGGREGATION_TEMPORALITY_DELTA
enable_exp_histogram: true
dimensions:
- name: service.namespace
default: default
- name: deployment.environment
default: default
# This is added to ensure the uniqueness of the timeseries
# Otherwise, identical timeseries produced by multiple replicas of
# collectors result in incorrect APM metrics
- name: signoz.collector.id
- name: service.version
- name: browser.platform
- name: browser.mobile
- name: k8s.cluster.name
- name: k8s.node.name
- name: k8s.namespace.name
- name: host.name
- name: host.type
- name: container.name
extensions:
health_check:
endpoint: 0.0.0.0:13133
pprof:
endpoint: 0.0.0.0:1777
exporters:
clickhousetraces:
datasource: tcp://clickhouse:9000/signoz_traces
low_cardinal_exception_grouping: ${env:LOW_CARDINAL_EXCEPTION_GROUPING}
use_new_schema: true
clickhousemetricswrite:
endpoint: tcp://clickhouse:9000/signoz_metrics
resource_to_telemetry_conversion:
enabled: true
disable_v2: true
clickhousemetricswrite/prometheus:
endpoint: tcp://clickhouse:9000/signoz_metrics
disable_v2: true
signozclickhousemetrics:
dsn: tcp://clickhouse:9000/signoz_metrics
clickhouselogsexporter:
dsn: tcp://clickhouse:9000/signoz_logs
timeout: 10s
use_new_schema: true
otlp:
endpoint: http://otel-collector:4317
tls:
insecure: true
# debug: {}
service:
telemetry:
logs:
encoding: json
metrics:
address: 0.0.0.0:8888
extensions:
- health_check
- pprof
pipelines:
traces:
receivers: [otlp]
processors: [signozspanmetrics/delta, batch]
exporters: [clickhousetraces]
metrics/docker:
receivers: [docker_stats]
processors: [resourcedetection/docker]
exporters: [otlp]
metrics:
receivers: [otlp]
processors: [batch]
exporters: [clickhousemetricswrite, signozclickhousemetrics]
metrics/prometheus:
receivers: [prometheus]
processors: [batch]
exporters: [clickhousemetricswrite/prometheus, signozclickhousemetrics]
logs:
receivers: [otlp]
processors: [batch]
exporters: [clickhouselogsexporter]

View File

@ -0,0 +1 @@
server_endpoint: ws://signoz:4320/v1/opamp

View File

@ -0,0 +1,25 @@
# my global config
global:
scrape_interval: 5s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files: []
# - "first_rules.yml"
# - "second_rules.yml"
# - 'alerts.yml'
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs: []
remote_read:
- url: tcp://clickhouse:9000/signoz_metrics

View File

@ -0,0 +1,243 @@
version: '3'
x-common: &common
networks:
- signoz-net
deploy:
restart_policy:
condition: on-failure
logging:
options:
max-size: 50m
max-file: '3'
x-clickhouse-defaults: &clickhouse-defaults
!!merge <<: *common
image: clickhouse/clickhouse-server:24.1.2-alpine
tty: true
user: "1000:1000"
deploy:
placement:
constraints: [node.hostname == manager-node-3]
labels:
signoz.io/scrape: 'true'
signoz.io/port: '9363'
signoz.io/path: '/metrics'
depends_on:
- init-clickhouse
- zookeeper-1
healthcheck:
test:
- CMD
- wget
- --spider
- -q
- 0.0.0.0:8123/ping
interval: 30s
timeout: 5s
retries: 3
ulimits:
nproc: 65535
nofile:
soft: 262144
hard: 262144
x-zookeeper-defaults: &zookeeper-defaults
!!merge <<: *common
image: bitnami/zookeeper:3.7.1
user: root
deploy:
placement:
constraints: [node.hostname == manager-node-1]
labels:
signoz.io/scrape: 'true'
signoz.io/port: '9141'
signoz.io/path: '/metrics'
healthcheck:
test:
- CMD-SHELL
- curl -s -m 2 http://localhost:8080/commands/ruok | grep error | grep null
interval: 30s
timeout: 5s
retries: 3
x-db-depend: &db-depend
!!merge <<: *common
depends_on:
- clickhouse
- schema-migrator
services:
init-clickhouse:
!!merge <<: *common
image: clickhouse/clickhouse-server:24.1.2-alpine
command:
- bash
- -c
- |
version="v0.0.1"
node_os=$$(uname -s | tr '[:upper:]' '[:lower:]')
node_arch=$$(uname -m | sed s/aarch64/arm64/ | sed s/x86_64/amd64/)
echo "Fetching histogram-binary for $${node_os}/$${node_arch}"
cd /tmp
wget -O histogram-quantile.tar.gz "https://github.com/SigNoz/signoz/releases/download/histogram-quantile%2F$${version}/histogram-quantile_$${node_os}_$${node_arch}.tar.gz"
tar -xvzf histogram-quantile.tar.gz
mv histogram-quantile /var/lib/clickhouse/user_scripts/histogramQuantile
deploy:
restart_policy:
condition: on-failure
volumes:
- /mnt/cephfs/signoz/data/clickhouse/user_scripts:/var/lib/clickhouse/user_scripts/
zookeeper-1:
!!merge <<: *zookeeper-defaults
# ports:
# - "2181:2181"
# - "2888:2888"
# - "3888:3888"
volumes:
- /mnt/cephfs/signoz/data/zookeeper-1:/bitnami/zookeeper
environment:
- ZOO_SERVER_ID=1
- ALLOW_ANONYMOUS_LOGIN=yes
- ZOO_AUTOPURGE_INTERVAL=1
- ZOO_ENABLE_PROMETHEUS_METRICS=yes
- ZOO_PROMETHEUS_METRICS_PORT_NUMBER=9141
clickhouse:
!!merge <<: *clickhouse-defaults
# TODO: needed for the clickhouse TCP connection
hostname: clickhouse
# ports:
# - "9000:9000"
# - "8123:8123"
# - "9181:9181"
configs:
- source: clickhouse-config-v1
target: /etc/clickhouse-server/config.xml
- source: clickhouse-users-v1
target: /etc/clickhouse-server/users.xml
- source: clickhouse-custom-function-v1
target: /etc/clickhouse-server/custom-function.xml
- source: clickhouse-cluster-v1
target: /etc/clickhouse-server/config.d/cluster.xml
volumes:
- /mnt/cephfs/signoz/data/clickhouse/data/user_scripts:/var/lib/clickhouse/user_scripts/
- /mnt/cephfs/signoz/data/clickhouse/data:/var/lib/clickhouse/
# - ../common/clickhouse/storage.xml:/etc/clickhouse-server/config.d/storage.xml
signoz:
!!merge <<: *db-depend
image: signoz/signoz:v0.86.1
command:
- --config=/root/config/prometheus.yml
# ports:
# - "8080:8080" # signoz port
# - "6060:6060" # pprof port
configs:
- source: signoz-prometheus-config-v1
target: /root/config/prometheus.yml
volumes:
- /mnt/cephfs/signoz/data/dashboards:/root/config/dashboards
- /mnt/cephfs/signoz/data/sqlite:/var/lib/signoz/
environment:
- SIGNOZ_ALERTMANAGER_PROVIDER=signoz
- SIGNOZ_TELEMETRYSTORE_CLICKHOUSE_DSN=tcp://clickhouse:9000
- SIGNOZ_SQLSTORE_SQLITE_PATH=/var/lib/signoz/signoz.db
- DASHBOARDS_PATH=/root/config/dashboards
- STORAGE=clickhouse
- GODEBUG=netdns=go
- TELEMETRY_ENABLED=true
- DEPLOYMENT_TYPE=docker-swarm
healthcheck:
test:
- CMD
- wget
- --spider
- -q
- localhost:8080/api/v1/health
interval: 30s
timeout: 5s
retries: 3
networks:
- signoz-net
- traefik_public
deploy:
labels:
- 'traefik.enable=true'
# --- Router for the Signoz UI ---
- 'traefik.http.routers.signoz.rule=Host(`signoz.genius.ceo`)'
- 'traefik.http.routers.signoz.entrypoints=https'
- 'traefik.http.routers.signoz.tls.certresolver=main'
# --- Service for the Signoz UI ---
- 'traefik.http.services.signoz.loadbalancer.server.port=8080'
# --- Network for Traefik ---
- 'traefik.swarm.network=traefik_public'
otel-collector:
!!merge <<: *db-depend
image: signoz/signoz-otel-collector:v0.111.42
user: root
command:
- --config=/etc/otel-collector-config.yaml
- --manager-config=/etc/manager-config.yaml
- --copy-path=/var/tmp/collector-config.yaml
- --feature-gates=-pkg.translator.prometheus.NormalizeName
configs:
- source: otel-collector-config-v4
target: /etc/otel-collector-config.yaml
- source: otel-collector-manager-config-v1
target: /etc/manager-config.yaml
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
environment:
- OTEL_RESOURCE_ATTRIBUTES=host.name={{.Node.Hostname}},os.type={{.Node.Platform.OS}}
- LOW_CARDINAL_EXCEPTION_GROUPING=false
ports:
# - "1777:1777" # pprof extension
- '4317:4317' # OTLP gRPC receiver
- '4318:4318' # OTLP HTTP receiver
deploy:
replicas: 3
labels:
- 'traefik.enable=true'
# --- Router for the Signoz Collector UI ---
- 'traefik.http.routers.signoz-collector.rule=Host(`collector.genius.ceo`)'
- 'traefik.http.routers.signoz-collector.entrypoints=https'
- 'traefik.http.routers.signoz-collector.tls.certresolver=main'
# --- Service for the Signoz Collector UI ---
- 'traefik.http.services.signoz-collector.loadbalancer.server.port=4318'
# --- Network for Traefik ---
- 'traefik.swarm.network=traefik_public'
depends_on:
- clickhouse
- schema-migrator
- signoz
networks:
- signoz-net
- traefik_public
schema-migrator:
!!merge <<: *common
image: signoz/signoz-schema-migrator:v0.111.42
deploy:
restart_policy:
condition: on-failure
delay: 5s
entrypoint: sh
command:
- -c
- '/signoz-schema-migrator sync --dsn=tcp://clickhouse:9000 --up= && /signoz-schema-migrator async --dsn=tcp://clickhouse:9000 --up='
depends_on:
- clickhouse
networks:
signoz-net:
name: signoz-net
attachable: true
traefik_public:
external: true
configs:
otel-collector-config-v4:
file: /mnt/cephfs/signoz/config/otel-collector-config.v4.yaml
otel-collector-manager-config-v1:
file: /mnt/cephfs/signoz/config/signoz/otel-collector-opamp-config.yaml
clickhouse-config-v1:
file: /mnt/cephfs/signoz/config/clickhouse/config.v1.xml
clickhouse-users-v1:
file: /mnt/cephfs/signoz/config/clickhouse/users.v1.xml
clickhouse-custom-function-v1:
file: /mnt/cephfs/signoz/config/clickhouse/custom-function.v1.xml
clickhouse-cluster-v1:
file: /mnt/cephfs/signoz/config/clickhouse/cluster.v1.xml
signoz-prometheus-config-v1:
file: /mnt/cephfs/signoz/config/signoz/prometheus.v1.yml
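A deployment sketch for this stack, assuming the file is saved as signoz.yml (hypothetical name) on a manager, the config files exist under /mnt/cephfs/signoz/config/ as referenced above, and the stack is named signoz (matching the signoz_* container-name filter used by the infra agent):

docker stack deploy -c signoz.yml signoz
docker stack ps signoz --no-trunc  # watch clickhouse, schema-migrator and signoz come up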

View File

@ -0,0 +1,33 @@
---
# - name: AUTHENTIK | Create directories and set permissions
# ansible.builtin.file:
# path: "/mnt/cephfs/authentik/data/{{ item }}"
# state: directory
# owner: 1000
# group: 1000
# mode: '0755'
# loop:
# - cache
# - certs
# - db
# - media
# - templates
# run_once: true
# delegate_to: "{{ groups['managers'][0] }}"
- name: AUTHENTIK | Generate Compose file
ansible.builtin.template:
src: docker-compose.yml.j2
dest: /mnt/cephfs/authentik/authentik.yml
mode: 0644
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: AUTHENTIK | Deploy app stack
community.docker.docker_stack:
state: present
name: authentik
compose:
- /mnt/cephfs/authentik/authentik.yml
delegate_to: "{{ groups['managers'][0] }}"
run_once: true

View File

@ -0,0 +1,100 @@
---
networks:
traefik_public:
external: true
internal:
services:
postgresql:
image: docker.io/library/postgres:16-alpine
restart: unless-stopped
healthcheck:
test: ["CMD-SHELL", "pg_isready -d $POSTGRES_DB -U $POSTGRES_USER"]
start_period: 20s
interval: 30s
retries: 5
timeout: 5s
volumes:
- /mnt/cephfs/authentik/data/db:/var/lib/postgresql/data
environment:
POSTGRES_PASSWORD: "{{ pg_pass }}"
POSTGRES_USER: "{{ pg_user | default('authentik') }}"
POSTGRES_DB: "{{ pg_db | default('authentik') }}"
networks:
- internal
redis:
image: docker.io/library/redis:alpine
command: --save 60 1 --loglevel warning
restart: unless-stopped
healthcheck:
test: ["CMD-SHELL", "redis-cli ping | grep PONG"]
start_period: 20s
interval: 30s
retries: 5
timeout: 3s
volumes:
- /mnt/cephfs/authentik/data/cache:/data
networks:
- internal
server:
image: "{{ authentik_image | default('ghcr.io/goauthentik/server') }}:{{ authentik_tag | default('2025.6.3') }}"
restart: unless-stopped
command: server
environment:
AUTHENTIK_SECRET_KEY: "{{ authentik_secret_key }}"
AUTHENTIK_REDIS__HOST: redis
AUTHENTIK_POSTGRESQL__HOST: postgresql
AUTHENTIK_POSTGRESQL__USER: "{{ pg_user | default('authentik') }}"
AUTHENTIK_POSTGRESQL__NAME: "{{ pg_db | default('authentik') }}"
AUTHENTIK_POSTGRESQL__PASSWORD: "{{ pg_pass }}"
AUTHENTIK_ERROR_REPORTING__ENABLED: "false"
volumes:
- /mnt/cephfs/authentik/data/media:/media
- /mnt/cephfs/authentik/data/templates:/templates
networks:
- traefik_public
- internal
deploy:
labels:
traefik.enable: "true"
traefik.swarm.network: {{ traefik_net }}
traefik.http.routers.authentik.rule: Host(`{{ traefik_route }}`) || HostRegexp(`{subdomain:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?}.genius.ceo`) && PathPrefix(`/outpost.goauthentik.io/`)
traefik.http.routers.authentik.entrypoints: https
traefik.http.routers.authentik.tls: "true"
traefik.http.routers.authentik.tls.certresolver: main
traefik.http.services.authentik.loadbalancer.server.port: 9000
# - "traefik.enable=true"
# - "traefik.swarm.network={{ traefik_net }}"
# - "traefik.http.routers.authentik.rule=Host(`{{ traefik_route }}`) || HostRegexp(`{subdomain:[A-Za-z0-9](?:[A-Za-z0-9\-]{0,61}[A-Za-z0-9])?}.genius.ceo`) && PathPrefix(`/outpost.goauthentik.io/`)"
# - "traefik.http.routers.authentik.entrypoints=https"
# - "traefik.http.routers.authentik.tls=true"
# - "traefik.http.routers.authentik.tls.certresolver=main"
# - "traefik.http.services.authentik.loadbalancer.server.port=9000"
worker:
image: "{{ authentik_image | default('ghcr.io/goauthentik/server') }}:{{ authentik_tag | default('2025.6.3') }}"
restart: unless-stopped
command: worker
environment:
AUTHENTIK_SECRET_KEY: "{{ authentik_secret_key }}"
AUTHENTIK_REDIS__HOST: redis
AUTHENTIK_POSTGRESQL__HOST: postgresql
AUTHENTIK_POSTGRESQL__USER: "{{ pg_user | default('authentik') }}"
AUTHENTIK_POSTGRESQL__NAME: "{{ pg_db | default('authentik') }}"
AUTHENTIK_POSTGRESQL__PASSWORD: "{{ pg_pass }}"
# `user: root` and the docker socket volume are optional.
# See more for the docker socket integration here:
# https://goauthentik.io/docs/outposts/integrations/docker
# Removing `user: root` also prevents the worker from fixing the permissions
# on the mounted folders, so when removing this make sure the folders have the correct UID/GID
# (1000:1000 by default)
user: root
volumes:
- /var/run/docker.sock:/var/run/docker.sock
- /mnt/cephfs/authentik/data/media:/media
- /mnt/cephfs/authentik/data/certs:/certs
- /mnt/cephfs/authentik/data/templates:/templates
networks:
- internal

View File

@ -0,0 +1,11 @@
---
authentik_image: "ghcr.io/goauthentik/server"
authentik_tag: "2025.6.3"
authentik_secret_key: ""
pg_user: "authentik"
pg_pass: ""
pg_db: "authentik"
traefik_net: "traefik_public"
traefik_route: "auth.genius.ceo"
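Both secrets above default to empty strings and have to be overridden elsewhere (for example in a vault-encrypted vars file), otherwise the stack will not come up with usable credentials. A minimal sketch with placeholder values only:
# Placeholder values — real values would live in an encrypted vars file, not here:
authentik_secret_key: "<50+ random characters>"
pg_pass: "<database password>"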

View File

@ -0,0 +1,93 @@
---
- name: CEPH | Determine the private IP of the first manager
ansible.builtin.set_fact:
ceph_bootstrap_ip: "{{ hostvars[inventory_hostname]['ansible_' + private_interface]['ipv4']['address'] }}"
when: inventory_hostname == groups['managers'][0]
- name: CEPH | Bootstrap the cluster on the first manager
ansible.builtin.command:
cmd: "cephadm bootstrap --mon-ip {{ ceph_bootstrap_ip }}"
creates: /etc/ceph/ceph.conf
when: inventory_hostname == groups['managers'][0]
- name: CEPH | Fetch the cephadm public SSH key
ansible.builtin.command: "cephadm shell -- ceph cephadm get-pub-key"
register: cephadm_pub_key
changed_when: false
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
- name: CEPH | Distribute the cephadm public key to root on all nodes
ansible.posix.authorized_key:
user: root
key: "{{ hostvars[groups['managers'][0]]['cephadm_pub_key'].stdout }}"
state: present
key_options: 'no-port-forwarding,no-X11-forwarding,no-agent-forwarding,no-pty'
- name: CEPH | Add the remaining nodes to the Ceph cluster
ansible.builtin.command:
cmd: "ceph orch host add {{ item }} {{ hostvars[item]['ansible_' + private_interface]['ipv4']['address'] }}"
loop: "{{ groups['all'] }}"
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
- name: CEPH | Check whether OSDs (storage devices) already exist
ansible.builtin.command: "ceph osd ls"
register: existing_osds
changed_when: false
failed_when: false
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
- name: CEPH | Add the dedicated disk ({{ ceph_osd_device }}) on each node as an OSD
ansible.builtin.command: "ceph orch daemon add osd {{ item }}:{{ ceph_osd_device }}"
loop: "{{ groups['all'] }}"
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
when: existing_osds.stdout | length == 0
- name: CEPH | Check whether CephFS already exists
ansible.builtin.command: "ceph fs ls -f json"
register: cephfs_list
changed_when: false
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
- name: CEPH | Create CephFS pools and filesystem if they do not exist
block:
- name: Create the metadata pool for CephFS
ansible.builtin.command: "ceph osd pool create {{ cephfs_name }}_metadata"
- name: Create the data pool for CephFS
ansible.builtin.command: "ceph osd pool create {{ cephfs_name }}_data"
- name: Create the CephFS filesystem
ansible.builtin.command: "ceph fs new {{ cephfs_name }} {{ cephfs_name }}_metadata {{ cephfs_name }}_data"
when: cephfs_list.stdout | from_json | length == 0
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
- name: CEPH | Start metadata servers (MDS) for CephFS
ansible.builtin.command: "ceph orch apply mds {{ cephfs_name }} --placement=2"
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
when: cephfs_list.stdout | from_json | length == 0
- name: CEPH | Fetch the Ceph admin key for mounting
ansible.builtin.command: "ceph auth get-key client.admin"
register: ceph_admin_key
changed_when: false
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
- name: CEPH | Create the mount point for CephFS
ansible.builtin.file:
path: /mnt/cephfs
state: directory
mode: '0755'
- name: CEPH | Mount CephFS on all nodes (and add it to /etc/fstab)
ansible.posix.mount:
path: /mnt/cephfs
src: "{{ hostvars[groups['managers'][0]]['ceph_bootstrap_ip'] }}:/"
fstype: ceph
opts: "name=admin,secret={{ ceph_admin_key.stdout }}"
state: mounted
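Since later roles rely on the CephFS mount, a verification step could be appended here. A minimal sketch (not part of the original role) that uses the same ceph CLI the role already calls and simply waits for a healthy cluster:

- name: CEPH | Wait until the cluster reports HEALTH_OK (optional verification sketch)
  ansible.builtin.command: "ceph health"
  register: ceph_health
  changed_when: false
  retries: 20
  delay: 15
  until: "'HEALTH_OK' in ceph_health.stdout"
  delegate_to: "{{ groups['managers'][0] }}"
  run_once: true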

View File

@ -0,0 +1,108 @@
---
- name: COMMON | Update and upgrade system packages
ansible.builtin.apt:
update_cache: true
upgrade: dist
autoremove: true
autoclean: true
- name: COMMON | Install required packages
ansible.builtin.apt:
name:
- ufw
- fail2ban
- unattended-upgrades
- apt-listchanges
- docker-ce
- python3-pip
- chrony
- lvm2
- cephadm
- ceph-common
state: present
- name: COMMON | Start and enable the chrony service
ansible.builtin.service:
name: chronyd
state: started
enabled: true
- name: COMMON | Start and enable the Docker service
ansible.builtin.service:
name: docker
state: started
enabled: true
- name: COMMON | Create a dedicated admin user
ansible.builtin.user:
name: "{{ admin_user }}"
password: "{{ admin_password}}"
shell: /bin/bash
groups: sudo,docker
append: true
state: present
- name: COMMON | Set up SSH keys for the admin user
ansible.posix.authorized_key:
user: "{{ admin_user }}"
key: "{{ item }}"
state: present
with_items: "{{ authorized_keys }}"
- name: COMMON | Create the cephadm user
ansible.builtin.user:
name: "cephadm"
password: "{{ cephadm_password }}"
shell: /bin/bash
groups: sudo,docker
append: true
state: present
- name: COMMON | Create the .ssh directory for the cephadm user
ansible.builtin.file:
path: /home/cephadm/.ssh
state: directory
owner: cephadm
group: cephadm
mode: '0700'
- name: COMMON | Allow passwordless sudo for the cephadm user
ansible.builtin.copy:
dest: "/etc/sudoers.d/91-cephadm-nopasswd"
content: "cephadm ALL=(ALL) NOPASSWD: ALL"
mode: '0440'
validate: 'visudo -cf %s'
- name: COMMON | Generate an ed25519 SSH key for the cephadm user (first manager only)
community.crypto.openssh_keypair:
path: /home/cephadm/.ssh/id_ed25519
type: ed25519
owner: cephadm
group: cephadm
mode: '0600'
when: inventory_hostname == groups['managers'][0]
- name: COMMON | Read the cephadm public SSH key (first manager only)
ansible.builtin.slurp:
src: /home/cephadm/.ssh/id_ed25519.pub
register: cephadm_ssh_pub_key
when: inventory_hostname == groups['managers'][0]
- name: COMMON | Distribute the cephadm public SSH key to all nodes
ansible.posix.authorized_key:
user: cephadm
key: "{{ hostvars[groups['managers'][0]]['cephadm_ssh_pub_key']['content'] | b64decode }}"
state: present
- name: COMMON | Configure automatic security updates
ansible.builtin.copy:
src: assets/50unattended-upgrades
dest: /etc/apt/apt.conf.d/50unattended-upgrades
owner: root
group: root
mode: '0644'
- name: COMMON | Enable periodic auto-updates
ansible.builtin.copy:
src: assets/20auto-upgrades
dest: /etc/apt/apt.conf.d/20auto-upgrades
owner: root
group: root
mode: '0644'
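A quick, optional sanity check for the unattended-upgrades configuration copied above could look like the following sketch (not part of the original role; uses the standard dry-run flag of the unattended-upgrade CLI):

- name: COMMON | Validate the unattended-upgrades configuration (optional sketch)
  ansible.builtin.command: "unattended-upgrade --dry-run"
  changed_when: false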

View File

@ -0,0 +1,58 @@
---
- name: SWARM | Ensure Docker SDK for Python is installed
ansible.builtin.apt:
name: python3-docker
state: present
- name: SWARM | Get interface IP address for the manager
ansible.builtin.set_fact:
manager_ip: "{{ hostvars[inventory_hostname]['ansible_' + private_interface]['ipv4']['address'] }}"
when: inventory_hostname == groups['managers'][0]
- name: SWARM | Initialize the Docker Swarm
community.docker.docker_swarm:
state: present
advertise_addr: "{{ manager_ip }}"
when: inventory_hostname == groups['managers'][0]
register: swarm_init_result
- name: SWARM | Get the join tokens
community.docker.docker_swarm_info:
register: swarm_info
when: inventory_hostname == groups['managers'][0]
- name: SWARM | Verify that join tokens were fetched
ansible.builtin.assert:
that:
- swarm_info is defined
- swarm_info.swarm_facts is defined
- swarm_info.swarm_facts.JoinTokens.Manager is defined
- swarm_info.swarm_facts.JoinTokens.Worker is defined
fail_msg: "Could not fetch the join tokens from the Swarm manager. Is the Swarm initialized correctly?"
success_msg: "Join tokens fetched successfully."
when: inventory_hostname == groups['managers'][0]
- name: SWARM | Join manager nodes to the Swarm
community.docker.docker_swarm:
state: join
remote_addrs: [ "{{ hostvars[groups['managers'][0]]['manager_ip'] }}:2377" ]
join_token: "{{ hostvars[groups['managers'][0]]['swarm_info']['swarm_facts']['JoinTokens']['Manager'] }}"
when: inventory_hostname in groups['managers']
- name: SWARM | Join worker nodes to the Swarm
community.docker.docker_swarm:
state: join
remote_addrs: [ "{{ hostvars[groups['managers'][0]]['manager_ip'] }}:2377" ]
join_token: "{{ hostvars[groups['managers'][0]]['swarm_info']['swarm_facts']['JoinTokens']['Worker'] }}"
when: inventory_hostname in groups['workers']
- name: SWARM | Verify Swarm Cluster State (run on manager)
ansible.builtin.command: docker node ls
register: swarm_nodes
changed_when: false
when: inventory_hostname == groups['managers'][0]
- name: SWARM | Display cluster state
ansible.builtin.debug:
msg: "{{ swarm_nodes.stdout_lines }}"
when: inventory_hostname == groups['managers'][0]
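An optional assertion could follow to make sure every inventory host actually joined. A sketch (not part of the original role) that assumes `docker node ls` prints one header line plus one line per node:

- name: SWARM | Assert that every inventory host joined the swarm (optional sketch)
  ansible.builtin.assert:
    that:
      - (swarm_nodes.stdout_lines | length) - 1 == (groups['all'] | length)
    fail_msg: "docker node ls does not list every inventory host yet."
  when: inventory_hostname == groups['managers'][0]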

View File

@ -0,0 +1,3 @@
---
dockge_stacks_dir: /mnt/cephfs/dockge/stacks
dockge_data_dir: /mnt/cephfs/dockge/data

View File

@ -0,0 +1,42 @@
---
- name: DOCKGE | Copy Stack Files
ansible.builtin.copy:
directory_mode: true
src: /Users/d3r0/dev/repositories/active/gc/iac/ansible/resources/dockge
dest: /mnt/cephfs
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
# - name: DOCKGE | Ensure the application data directory exists
# ansible.builtin.file:
# path: "{{ dockge_data_dir }}"
# state: directory
# owner: root
# group: root
# mode: '0755'
# become: true
# - name: DOCKGE | Ensure the stacks directory exists
# ansible.builtin.file:
# path: "{{ dockge_stacks_dir }}"
# state: directory
# owner: root
# group: root
# mode: '0755'
# become: true
# - name: DOCKGE | Deploy the stack from the template file
# community.docker.docker_stack:
# state: present
# name: dockge
# compose:
# - "{{ lookup('template', '../../../resources/dockge/dockge.yml') }}"
# delegate_to: "{{ groups['managers'][0] }}"
# run_once: true
- name: DOCKGE | Deploy app stack
community.docker.docker_stack:
state: present
name: dockge
compose:
- /mnt/cephfs/dockge/dockge.yml
delegate_to: "{{ groups['managers'][0] }}"
run_once: true

View File

@ -0,0 +1,9 @@
---
- name: FAIL2BAN | Create a local jail configuration
ansible.builtin.template:
src: jail.local.j2
dest: /etc/fail2ban/jail.local
owner: root
group: root
mode: '0644'
notify: restart fail2ban

View File

@ -0,0 +1,8 @@
[DEFAULT]
bantime = 1h
findtime = 10m
maxretry = 5
[sshd]
enabled = true
port = {{ ssh_port }}
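To confirm fail2ban actually picked up the templated SSH port, a check could be added to the role's tasks. A sketch (not part of the original role) using the standard fail2ban-client CLI:

- name: FAIL2BAN | Show sshd jail status (optional sketch)
  ansible.builtin.command: "fail2ban-client status sshd"
  register: f2b_sshd_status
  changed_when: false
- name: FAIL2BAN | Print jail status (optional sketch)
  ansible.builtin.debug:
    msg: "{{ f2b_sshd_status.stdout_lines }}"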

View File

@ -0,0 +1,14 @@
postgres_version: 16-alpine
gitea_version: "1.21"
gitea_domain: "{{ subdomain }}.{{ main_domain }}"
gitea_http_port: 3000
gitea_ssh_port: 2222
data_dir: "{{ ceph_volume }}/gitea"
subdomain: git
gitea_db_type: "postgres"
gitea_db_host: db:5432
gitea_db_name: "gitea"
gitea_db_user: "gitea"
gitea_db_password: ""

View File

@ -0,0 +1,38 @@
- name: GITEA | Ensure data directories
ansible.builtin.file:
path: '{{ data_dir }}/data'
state: directory
owner: 1000
group: 1000
mode: '0750'
recurse: true
delegate_to: "{{ groups['managers'][0] }}"
- name: GITEA | Ensure DB data directories
ansible.builtin.file:
path: "{{ data_dir }}/data/db"
state: directory
# The Postgres Alpine image runs as UID 70 (postgres).
# Debian-based images would use 999 instead.
owner: 70
group: 70
mode: '0700'
recurse: true
delegate_to: "{{ groups['managers'][0] }}"
- name: GITEA | Generate Compose file
ansible.builtin.template:
src: docker-compose.yml.j2
dest: '{{ data_dir }}/gitea.yml'
mode: '0644'
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: GITEA | Deploy stack
community.docker.docker_stack:
state: present
name: gitea
compose:
- '{{ data_dir }}/gitea.yml'
delegate_to: "{{ groups['managers'][0] }}"
run_once: true

View File

@ -0,0 +1,62 @@
networks:
{{ traefik_public_net }}:
external: true
internal:
services:
server:
image: gitea/gitea:{{ gitea_version }}
environment:
- USER_UID=1000
- USER_GID=1000
- GITEA__database__DB_TYPE={{ gitea_db_type }}
- GITEA__database__HOST={{ gitea_db_host }}
- GITEA__database__NAME={{ gitea_db_name }}
- GITEA__database__USER={{ gitea_db_user }}
- GITEA__database__PASSWD={{ gitea_db_password }}
- GITEA__server__DOMAIN={{ gitea_domain }}
- GITEA__server__SSH_DOMAIN={{ gitea_domain }}
- GITEA__server__SSH_PORT={{ gitea_ssh_port }}
- GITEA__server__ROOT_URL=https://{{ gitea_domain }}/
volumes:
- {{ data_dir }}/data:/data
- /etc/timezone:/etc/timezone:ro
- /etc/localtime:/etc/localtime:ro
networks:
- internal
- {{ traefik_public_net }}
ports:
- "{{ gitea_ssh_port }}:22"
deploy:
mode: replicated
replicas: 1
labels:
- "traefik.enable=true"
- "traefik.docker.network={{ traefik_public_net }}"
- "traefik.http.routers.gitea.rule=Host(`{{ gitea_domain }}`)"
- "traefik.http.routers.gitea.entrypoints=https"
- "traefik.http.routers.gitea.tls.certresolver=main"
- "traefik.http.services.gitea.loadbalancer.server.port=3000"
db:
image: postgres:{{ postgres_version }}
restart: always
environment:
- POSTGRES_USER={{ gitea_db_user }}
- POSTGRES_PASSWORD={{ gitea_db_password }}
- POSTGRES_DB={{ gitea_db_name }}
networks:
- internal
volumes:
- {{ data_dir }}/data/db:/var/lib/postgresql/data
command:
- "postgres"
- "-c"
- "fsync=on"
- "-c"
- "full_page_writes=on"
- "-c"
- "synchronous_commit=on"
deploy:
mode: replicated
replicas: 1

View File

@ -0,0 +1,47 @@
---
- name: KESTRA | Ensure data directory
ansible.builtin.file:
path: '{{ data_dir }}/data/data'
state: directory
mode: '0755'
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: KESTRA | Ensure db directory
ansible.builtin.file:
path: '{{ data_dir }}/data/db'
state: directory
mode: '0755'
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: KESTRA | Create the tmpfiles.d configuration file
ansible.builtin.copy:
content: "d /tmp/kestra-wd 0755 root root -"
dest: /etc/tmpfiles.d/kestra-wd.conf
owner: root
group: root
mode: '0644'
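# The tmpfiles.d "d" entry above makes systemd recreate /tmp/kestra-wd after a reboot,
# so the bind mount used by the Kestra service keeps working.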
- name: KESTRA | Create Kestra working directory
ansible.builtin.file:
path: /tmp/kestra-wd
state: directory
mode: '0755'
- name: KESTRA | Generate Compose file
ansible.builtin.template:
src: docker-compose.yml.j2
dest: '{{ data_dir }}/kestra.yml'
mode: '0644'
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: KESTRA | Deploy stack
community.docker.docker_stack:
state: present
name: kestra
compose:
- '{{ data_dir }}/kestra.yml'
delegate_to: "{{ groups['managers'][0] }}"
run_once: true

View File

@ -0,0 +1,92 @@
networks:
internal:
{{ traefik_public_net }}:
external: true
services:
postgres:
image: postgres:17
volumes:
- {{ data_dir }}/data/db:/var/lib/postgresql/data
environment:
POSTGRES_DB: {{ kestra.db.name }}
POSTGRES_USER: {{ kestra.db.user }}
POSTGRES_PASSWORD: "{{ kestra.db.pass }}"
healthcheck:
test: ["CMD-SHELL", "pg_isready -d '$${POSTGRES_DB}' -U $${POSTGRES_USER}"]
interval: 30s
timeout: 10s
retries: 10
networks:
- internal
deploy:
mode: replicated
replicas: 1
kestra:
image: kestra/kestra:v0.24.2
entrypoint: /bin/bash
# Note that this is meant for development only. Refer to the documentation for production deployments of Kestra which runs without a root user.
user: "root"
command:
- -c
- /app/kestra server standalone --worker-thread=128
volumes:
- {{ data_dir }}/data/data:/app/storage
- /var/run/docker.sock:/var/run/docker.sock
- /tmp/kestra-wd:/tmp/kestra-wd
environment:
KESTRA_CONFIGURATION: |
datasources:
postgres:
url: jdbc:postgresql://postgres:5432/kestra
driverClassName: org.postgresql.Driver
username: {{ kestra.db.user }}
password: {{ kestra.db.pass }}
kestra:
tutorialFlows:
enabled: false
traces:
root: DEFAULT
micronaut:
metrics:
export:
otlp:
enabled: true
url: http://signoz_otel-collector:4318/v1/metrics
otel:
traces:
exporter: otlp
exporter:
otlp:
endpoint: http://signoz_otel-collector:4318
server:
basic-auth:
username: {{ kestra.basic_auth.user }}
password: {{ kestra.basic_auth.pass }}
repository:
type: postgres
storage:
type: local
local:
base-path: "/app/storage"
queue:
type: postgres
tasks:
tmp-dir:
path: /tmp/kestra-wd/tmp
url: http://localhost:8080/
networks:
- {{ traefik_public_net }}
- internal
deploy:
mode: replicated
replicas: 1
labels:
- "traefik.enable=true"
- "traefik.swarm.network={{ traefik_public_net }}"
- "traefik.http.routers.kestra.rule=Host(`{{ subdomain }}.{{ main_domain }}`)"
- "traefik.http.routers.kestra.entrypoints=https"
- "traefik.http.routers.kestra.tls=true"
- "traefik.http.routers.kestra.tls.certresolver=main"
- "traefik.http.services.kestra.loadbalancer.server.port=8080"

View File

@ -0,0 +1,11 @@
subdomain: kestra
data_dir: "{{ ceph_volume }}/kestra"
kestra:
basic_auth:
user: "ma@coachhamburg.com"
pass: "igyozi9B87yTeiQ6z2sbe8Y4aQLJV58jdaCNu"
db:
name: kestra
user: kestra
pass: ""

View File

@ -0,0 +1,13 @@
---
- name: Copy Stack Files
ansible.builtin.copy:
directory_mode: true
src: /Users/d3r0/dev/repositories/active/gc/iac/ansible/resources/monitoring
dest: /srv
- block:
- name: Deploy Monitoring stack
community.docker.docker_stack:
state: present
name: monitoring
compose:
- /srv/monitoring/observability.yml

View File

@ -0,0 +1,25 @@
---
- name: PORTAINER | Ensure data directories
ansible.builtin.file:
path: '{{ data_dir }}/data'
state: directory
mode: '0755'
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: PORTAINER | Generate Compose file
ansible.builtin.template:
src: docker-compose.yml.j2
dest: '{{ data_dir }}/portainer.yml'
mode: '0644'
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: PORTAINER | Deploy stack
community.docker.docker_stack:
state: present
name: portainer
compose:
- '{{ data_dir }}/portainer.yml'
delegate_to: "{{ groups['managers'][0] }}"
run_once: true

View File

@ -0,0 +1,37 @@
version: '3.2'
services:
agent:
image: portainer/agent:{{ portainer_version }}
volumes:
- /var/run/docker.sock:/var/run/docker.sock
- /var/lib/docker/volumes:/var/lib/docker/volumes
networks:
- {{ traefik_public_net }}
deploy:
mode: global
placement:
constraints: [node.platform.os == linux]
portainer:
image: portainer/portainer-ce:{{ portainer_version }}
command: -H tcp://portainer_agent:9001 --tlsskipverify
volumes:
- {{ data_dir }}/data:/data
networks:
- {{ traefik_public_net }}
deploy:
mode: replicated
replicas: 1
labels:
- "traefik.enable=true"
- "traefik.swarm.network={{ traefik_public_net }}"
- "traefik.http.routers.portainer.rule=Host(`{{ subdomain }}.{{ main_domain }}`)"
- "traefik.http.routers.portainer.entrypoints=https"
- "traefik.http.routers.portainer.tls=true"
- "traefik.http.routers.portainer.tls.certresolver=main"
- "traefik.http.services.portainer.loadbalancer.server.port=9000"
networks:
{{ traefik_public_net }}:
external: true

View File

@ -0,0 +1,4 @@
subdomain: port
data_dir: "{{ ceph_volume }}/portainer"
portainer_version: 2.33.5

View File

@ -0,0 +1,18 @@
---
- name: Copy Stack Files
ansible.builtin.copy:
directory_mode: true
src: /Users/d3r0/dev/repositories/active/gc/iac/ansible/resources/signoz-infra
dest: /mnt/cephfs
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
- name: Deploy Signoz Infra stack
community.docker.docker_stack:
state: present
name: signoz-infra
prune: true
compose:
- /mnt/cephfs/signoz-infra/signoz-infra.yml
delegate_to: "{{ groups['managers'][0] }}"
run_once: true

View File

@ -0,0 +1,18 @@
---
- name: Copy Stack Files
ansible.builtin.copy:
directory_mode: true
src: /Users/d3r0/dev/repositories/active/gc/iac/ansible/resources/signoz
dest: /mnt/cephfs
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
- name: Deploy Signoz stack
community.docker.docker_stack:
state: present
name: signoz
prune: true
compose:
- /mnt/cephfs/signoz/signoz.yml
delegate_to: "{{ groups['managers'][0] }}"
run_once: true

View File

@ -0,0 +1,30 @@
---
- name: SSH | Ensure privilege separation directory exists
ansible.builtin.file:
path: /run/sshd
state: directory
mode: '0755'
- name: SSH | Allow root login with keys only
ansible.builtin.lineinfile:
path: /etc/ssh/sshd_config
regexp: '^#?PermitRootLogin'
line: 'PermitRootLogin prohibit-password'
validate: 'sshd -t -f %s'
notify: restart sshd
- name: SSH | Disable password authentication
ansible.builtin.lineinfile:
path: /etc/ssh/sshd_config
regexp: '^#?PasswordAuthentication'
line: 'PasswordAuthentication no'
validate: 'sshd -t -f %s'
notify: restart sshd
- name: SSH | Forbid empty passwords
ansible.builtin.lineinfile:
path: /etc/ssh/sshd_config
regexp: '^#?PermitEmptyPasswords'
line: 'PermitEmptyPasswords no'
validate: 'sshd -t -f %s'
notify: restart sshd

View File

@ -0,0 +1,18 @@
[http]
[http.middlewares]
[http.middlewares.authentik.forwardAuth]
address = "http://authentik_server:9000/outpost.goauthentik.io/auth/traefik"
trustForwardHeader = true
authResponseHeaders = [
"X-authentik-username",
"X-authentik-groups",
"X-authentik-email",
"X-authentik-name",
"X-authentik-uid",
"X-authentik-jwt",
"X-authentik-meta-jwks",
"X-authentik-meta-outpost",
"X-authentik-meta-provider",
"X-authentik-meta-app",
"X-authentik-meta-version"
]
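Any service routed through Traefik can opt into this forwardAuth middleware by referencing it as `authentik@file` in its router labels, exactly as the Traefik dashboard router in this repository does. A sketch with a hypothetical service name and hostname:

services:
  myapp:
    # ... image, networks, volumes of the hypothetical service ...
    deploy:
      labels:
        - "traefik.enable=true"
        - "traefik.http.routers.myapp.rule=Host(`app.genius.ceo`)"
        - "traefik.http.routers.myapp.entrypoints=https"
        - "traefik.http.routers.myapp.tls.certresolver=main"
        - "traefik.http.routers.myapp.middlewares=authentik@file"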

View File

@ -0,0 +1,80 @@
[global]
checkNewVersion = true
sendAnonymousUsage = false
[experimental]
otlpLogs = true
[core]
defaultRuleSyntax = "v2"
[accessLog]
filePath = "/logs/access.log"
format = "json"
# Enable the Dashboard
[api]
dashboard = true
# Write out Traefik logs
[log]
level = "INFO"
format = "json"
filePath = "/logs/traefik.log"
# [log.otlp.http]
# endpoint = "http://signoz_otel-collector:4318/v1/logs"
[entryPoints]
[entryPoints.http]
address = ":80"
[entryPoints.http.http.redirections.entryPoint]
to = "https"
scheme = "https"
[entryPoints.https]
address = ":443"
# [entryPoints.https.http.tls]
# certResolver = "main"
# OTel
# [tracing]
# serviceName = "traefik"
# [tracing.otlp.http]
# endpoint = "http://signoz_otel-collector:4318/v1/traces"
# [tracing.otlp.http.tls]
# insecureSkipVerify = true
# # Metrics
# [metrics]
# addInternals = false
# [metrics.otlp]
# serviceName = "traefik"
# addEntryPointsLabels = true
# addRoutersLabels = true
# addServicesLabels = true
# [metrics.otlp.http]
# endpoint = "http://signoz_otel-collector:4318/v1/metrics"
# [metrics.otlp.grpc]
# endpoint = "monitoring_alloy:4317"
# insecure = true
# Let's Encrypt
[certificatesResolvers.main.acme]
email = "ma@coachhamburg.com"
storage = "acme.json"
# uncomment to use staging CA for testing
# caServer = "https://acme-staging-v02.api.letsencrypt.org/directory"
# [certificatesResolvers.main.acme.tlsChallenge]
[certificatesResolvers.main.acme.dnsChallenge]
provider = "digitalocean"
# Uncomment to use HTTP validation, like a caveman!
# [certificatesResolvers.main.acme.httpChallenge]
# entryPoint = "http"
[providers]
[providers.swarm]
endpoint = "unix:///var/run/docker.sock"
exposedByDefault = false
[providers.file]
directory = "/etc/traefik/dynamic"
watch = true

View File

@ -0,0 +1,44 @@
---
- name: TRAEFIK | Copy Stack Files
ansible.builtin.copy:
directory_mode: true
src: traefik
dest: "{{ ceph_volume }}"
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: TRAEFIK | Generate Compose file
ansible.builtin.template:
src: docker-compose.yml.j2
dest: "{{ data_dir }}/traefik.yml"
mode: '0644'
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: TRAEFIK | Ensure acme.json exists and has restrictive permissions
ansible.builtin.file:
path: "{{ data_dir }}/data/acme.json"
state: touch
access_time: preserve
modification_time: preserve
mode: '0600'
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: TRAEFIK | Create the traefik_public network
community.docker.docker_network:
name: traefik_public
driver: overlay
state: present
attachable: true
ipam_config:
- subnet: '172.16.200.0/24'
gateway: '172.16.200.1'
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: TRAEFIK | Deploy app stack
community.docker.docker_stack:
state: present
name: traefik
compose:
- "{{ data_dir }}/traefik.yml"
delegate_to: "{{ groups['managers'][0] }}"
run_once: true

View File

@ -0,0 +1,55 @@
services:
app:
image: traefik:{{ traefik_version }}
ports:
- target: 80
published: 80
protocol: tcp
mode: host
- target: 443
published: 443
protocol: tcp
mode: host
- target: 8080
published: 8080
protocol: tcp
environment:
# - HETZNER_API_TOKEN={{ hetzner_api_key }}
- DO_AUTH_TOKEN={{ do_api_key }}
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
- {{ data_dir }}/config:/etc/traefik
- {{ data_dir }}/data/logs:/logs
- {{ data_dir }}/data/acme.json:/acme.json
# healthcheck:
# test: ["CMD", "traefik", "healthcheck", "--ping"]
# timeout: 1s
# interval: 10s
# retries: 3
# start_period: 10s
networks:
- {{ traefik_public_net }}
# Global mode makes an instance of traefik listen on _every_ node, so that regardless of which
# node the request arrives on, it'll be forwarded to the correct backend service.
deploy:
mode: global
labels:
- "traefik.enable=true"
- "traefik.swarm.network={{ traefik_public_net }}"
- "traefik.http.routers.api.rule=Host(`{{ subdomain }}.{{ main_domain }}`) && (PathPrefix(`/api`) || PathPrefix(`/dashboard`))"
- "traefik.http.routers.api.entrypoints=https"
{% if use_authentik %}
- "traefik.http.routers.api.middlewares=authentik@file"
{% endif %}
- "traefik.http.routers.api.tls.domains[0].main={{ main_domain }}"
- "traefik.http.routers.api.tls.domains[0].sans=*.{{ main_domain }}"
- "traefik.http.routers.api.tls=true"
- "traefik.http.routers.api.tls.certresolver=main"
- "traefik.http.routers.api.service=api@internal"
- "traefik.http.services.dummy.loadbalancer.server.port=9999"
placement:
constraints: [node.role == manager]
networks:
{{ traefik_public_net }}:
external: true

View File

@ -0,0 +1,5 @@
subdomain: router
use_authentik: true
data_dir: "{{ ceph_volume }}/traefik"
traefik_version: v3.6.2

View File

@ -0,0 +1,84 @@
---
- name: FIREWALL | Reset UFW to its default settings
community.general.ufw:
state: reset
- name: FIREWALL | Allow all outgoing traffic by default
community.general.ufw:
direction: outgoing
policy: allow
- name: FIREWALL | Deny all incoming traffic by default
community.general.ufw:
direction: incoming
policy: deny
- name: FIREWALL | Allow incoming SSH traffic on the public interface
community.general.ufw:
rule: allow
port: "{{ ssh_port }}"
proto: tcp
interface: "{{ public_interface }}"
direction: in
- name: FIREWALL | Allow incoming SSH traffic on the private interface
community.general.ufw:
rule: allow
port: "{{ ssh_port }}"
proto: tcp
interface: "{{ private_interface }}"
direction: in
- name: FIREWALL | Allow incoming HTTP/HTTPS traffic on the public interface
community.general.ufw:
rule: allow
port: "{{ item.port }}"
proto: "{{ item.proto }}"
interface: "{{ public_interface }}"
direction: in
with_items:
- { port: '80', proto: 'tcp' }
- { port: '443', proto: 'tcp' }
- name: FIREWALL | Allow Ceph monitor ports on the private interface
community.general.ufw:
rule: allow
port: "{{ item }}"
proto: tcp
interface: "{{ private_interface }}"
direction: in
with_items:
- '3300'
- '6789'
- name: FIREWALL | Allow the Ceph OSD/MGR port range on the private interface
community.general.ufw:
rule: allow
port: "6800:7568"
proto: tcp
interface: "{{ private_interface }}"
direction: in
- name: FIREWALL | Allow the Docker Swarm management port on the private interface
community.general.ufw:
rule: allow
port: "2377"
proto: tcp
interface: "{{ private_interface }}"
direction: in
- name: FIREWALL | Allow Docker Swarm discovery/overlay network ports on the private interface
community.general.ufw:
rule: allow
port: "{{ item.port }}"
proto: "{{ item.proto }}"
interface: "{{ private_interface }}"
direction: in
with_items:
- { port: '7946', proto: 'tcp' }
- { port: '7946', proto: 'udp' }
- { port: '4789', proto: 'udp' }
- name: FIREWALL | Enable UFW
community.general.ufw:
state: enabled
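An optional final task could print the resulting rule set for review. A minimal sketch (not part of the original role):

- name: FIREWALL | Show the resulting rule set (optional sketch)
  ansible.builtin.command: "ufw status verbose"
  register: ufw_status
  changed_when: false
- name: FIREWALL | Print the rule set (optional sketch)
  ansible.builtin.debug:
    msg: "{{ ufw_status.stdout_lines }}"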

View File

@ -0,0 +1,3 @@
config:
hcloud:token:
secure: AAABAHkvxBXaEbrikY6bNyuwXehFp71LvsHTT2LOYHLiAaRCil5cSODn1EktYTYL+f4ryGJtN1j/wiyrAkbZBnyVC1QnSb84tTLYeKYXBtHo2fY87vReuyOwFZbFGylC

9
iac/cluster/Pulumi.yaml Normal file
View File

@ -0,0 +1,9 @@
name: gc-infra
description: A minimal Go Pulumi program
runtime: go
config:
pulumi:tags:
value:
pulumi:template: go
# hcloud:token:
# value: xqb89P4vF2YlBjU75AAtyoQzNvTHaXyhB0J2UYR8dAmEQDKz5GWeKO7KgEyPzUu5

View File

@ -16,7 +16,7 @@ import (
type Infrastructure struct {
placementGroup *hcloud.PlacementGroup
networkID *pulumi.IDOutput
masterNodes []*hcloud.Server
managerNodes []*hcloud.Server
workerNodes []*hcloud.Server
}
@ -55,30 +55,31 @@ func main() {
panic(err.Error())
}
infra.masterNodes, err = utils.CreateServer(ctx, utils.CreateServerArgs{
infra.managerNodes, err = utils.CreateServer(ctx, utils.CreateServerArgs{
PlacementGroupId: infra.placementGroup.ID(),
NetworkId: infra.networkID,
NetworkFirstIP: string(utils.IncrementIP(net.ParseIP("10.0.1.0"))),
Basename: "master-node",
Count: 1,
SshKey: hkey,
})
if err != nil {
panic(err.Error())
}
infra.workerNodes, err = utils.CreateServer(ctx, utils.CreateServerArgs{
PlacementGroupId: infra.placementGroup.ID(),
NetworkId: infra.networkID,
NetworkFirstIP: string(utils.IncrementIP(net.ParseIP("10.0.1.20"))),
Basename: "worker-node",
Count: 2,
Basename: "manager-node",
Count: 3,
SshKey: hkey,
ServerType: "ccx23",
})
if err != nil {
panic(err.Error())
}
// infra.workerNodes, err = utils.CreateServer(ctx, utils.CreateServerArgs{
// PlacementGroupId: infra.placementGroup.ID(),
// NetworkId: infra.networkID,
// NetworkFirstIP: string(utils.IncrementIP(net.ParseIP("10.0.1.20"))),
// Basename: "worker-node",
// Count: 2,
// SshKey: hkey,
// })
// if err != nil {
// panic(err.Error())
// }
for idx, s := range slices.Concat(infra.masterNodes, infra.workerNodes) {
for idx, s := range slices.Concat(infra.managerNodes, infra.workerNodes) {
err := utils.InstallAnsibleDependencies(ctx, remote.ConnectionArgs{
Host: s.Ipv4Address,
User: pulumi.String("root"),
@ -89,22 +90,28 @@ func main() {
}
}
var advAddr = infra.masterNodes[0].Networks.ApplyT(func(net []hcloud.ServerNetworkType) string {
return *net[0].Ip
}).(pulumi.StringOutput)
// var advAddr = infra.managerNodes[0].Networks.ApplyT(func(net []hcloud.ServerNetworkType) string {
// return *net[0].Ip
// }).(pulumi.StringOutput)
tokens, err := utils.InitDockerSwarm(ctx, remote.ConnectionArgs{
Host: infra.masterNodes[0].Ipv4Address,
User: pulumi.String("root"),
PrivateKey: pk.PrivateKeyOpenssh}, advAddr)
if err != nil {
panic(err.Error())
}
// tokens, err := utils.InitDockerSwarm(ctx, remote.ConnectionArgs{
// Host: infra.managerNodes[0].Ipv4Address,
// User: pulumi.String("root"),
// PrivateKey: pk.PrivateKeyOpenssh}, advAddr)
// if err != nil {
// panic(err.Error())
// }
ctx.Export("SwarmTokens", tokens)
// ctx.Export("SwarmTokens", tokens)
// inventory, err := utils.CreateAnsibleInventory(infra.managerNodes, infra.workerNodes)
// if err != nil {
// panic(err.Error())
// }
// ctx.Export("inventory", inventory)
sm := map[string]pulumi.Input{}
for idx, s := range slices.Concat(infra.masterNodes, infra.workerNodes) {
for idx, s := range slices.Concat(infra.managerNodes, infra.workerNodes) {
sm[fmt.Sprintf("node-%d-ip", idx)] = s.Ipv4Address
}
ctx.Export("server-ips", pulumi.Map(sm))

View File

@ -1,11 +1,14 @@
package utils
import (
"bytes"
"fmt"
"regexp"
"strings"
"text/template"
"github.com/pulumi/pulumi-command/sdk/go/command/remote"
"github.com/pulumi/pulumi-hcloud/sdk/go/hcloud"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
@ -14,6 +17,11 @@ type SwarmJoinTokens struct {
WorkerToken string
}
type ServerInfo struct {
Name pulumi.StringOutput
IP pulumi.StringOutput
}
func InstallAnsibleDependencies(ctx *pulumi.Context, connArgs remote.ConnectionArgs, uniqueness string) error {
_, err := remote.NewCommand(ctx, strings.Join([]string{uniqueness, "Install Ansible Dependencies"}, ": "),
&remote.CommandArgs{
@ -26,7 +34,7 @@ func InstallAnsibleDependencies(ctx *pulumi.Context, connArgs remote.ConnectionA
return nil
}
func InitDockerSwarm(ctx *pulumi.Context, connArgs remote.ConnectionArgs, advertiseAddr pulumi.StringOutput) (pulumi.StringOutput, error) {
func InitDockerSwarm(ctx *pulumi.Context, connArgs remote.ConnectionArgs, advertiseAddr pulumi.StringOutput) (pulumi.Output, error) {
var tokens SwarmJoinTokens
fullCommand := advertiseAddr.ApplyT(func(addr string) *string {
@ -44,17 +52,103 @@ func InitDockerSwarm(ctx *pulumi.Context, connArgs remote.ConnectionArgs, advert
return pulumi.StringOutput{}, err
}
return out.Stdout.ApplyT(func(output string) string {
return out.Stdout.ApplyT(func(output string) SwarmJoinTokens {
searchWorker := "Worker Token: "
pattern := regexp.MustCompile(searchWorker + `(\S+)`)
patternWorker := regexp.MustCompile(searchWorker + `(\S+)`)
searchManager := "Manager Token: "
patternManager := regexp.MustCompile(searchManager + `(\S+)`)
matches := pattern.FindStringSubmatch(output)
matches := patternWorker.FindStringSubmatch(output)
if len(matches) > 1 {
extracted := matches[1]
tokens.WorkerToken = extracted
return extracted
}
fmt.Println(tokens.WorkerToken)
return ""
}).(pulumi.StringOutput), nil
matches = patternManager.FindStringSubmatch(output)
if len(matches) > 1 {
extracted := matches[1]
tokens.ManagerToken = extracted
}
return tokens
}), nil
}
func CreateAnsibleInventory(managerNodes, workerNodes []*hcloud.Server) (pulumi.Output, error) {
serverInfos := toServerInfo(managerNodes)
return pulumi.All(pulumi.ToOutput(serverInfos)).ApplyT(func(results []interface{}) (string, error) {
var serverInfos = results[0].([]ServerInfo)
// var workerSlice = results[1].([]*hcloud.Server)
serverData := make(map[string][]ServerInfo)
for _, s := range serverInfos {
serverData["Manager"] = append(serverData["Manager"], ServerInfo{
Name: s.Name,
IP: s.IP,
})
}
// for _, result := range workerSlice {
// server := result.(map[string]interface{})
// serverData["Worker"] = append(serverData["Worker"], ServerInfo{
// Name: server["name"].(string),
// IP: server["ipv4_address"].(string),
// })
// }
fmt.Println(serverData["Manager"])
fmt.Println(results[0])
return generateInventoryFile(serverData)
}).(pulumi.Output), nil
}
func toServerInfo(server []*hcloud.Server) pulumi.ArrayOutput {
serverInfo := []ServerInfo{}
for _, s := range server {
serverInfo = append(serverInfo, ServerInfo{
Name: s.Name,
IP: s.Ipv4Address,
})
}
return pulumi.All(serverInfo).ApplyT(func(args []interface{}) []interface{} {
var serverInfo []interface{}
for _, s := range args {
val := s.(map[string]interface{})
serverInfo = append(serverInfo, map[string]interface{}{
"Name": val["Name"].(string),
"IP": val["IP"].(string),
})
}
return serverInfo
}).(pulumi.ArrayOutput)
}
func generateInventoryFile(inventory map[string][]ServerInfo) (string, error) {
const inventoryTmpl = `
[all]
{{ range .Manager }}
{{ .Name }} ansible_host={{ .IP }} ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=../infra-base/private_key
{{ end }}
{{ range .Worker }}
{{ .Name }} ansible_host={{ .IP }} ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=../infra-base/private_key
{{ end }}
[manager]
{{ range .Manager }}
{{ .Name }} ansible_host={{ .IP }} ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=../infra-base/private_key
{{ end }}
[worker]
{{ range .Worker }}
{{ .Name }} ansible_host={{ .IP }} ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=../infra-base/private_key
{{ end }}
`
tmpl, err := template.New("inventory").Parse(inventoryTmpl)
if err != nil {
return "", err
}
var buf bytes.Buffer
err = tmpl.Execute(&buf, inventory)
if err != nil {
return "", err
}
return buf.String(), nil
}

View File

@ -54,6 +54,7 @@ type CreateServerArgs struct {
Basename string
Count int
SshKey *hcloud.SshKey
ServerType string
}
func CreateServer(ctx *pulumi.Context, cfg CreateServerArgs) ([]*hcloud.Server, error) {
@ -64,9 +65,8 @@ func CreateServer(ctx *pulumi.Context, cfg CreateServerArgs) ([]*hcloud.Server,
s, err := hcloud.NewServer(ctx, sn, &hcloud.ServerArgs{
Name: pulumi.String(sn),
Image: pulumi.String("docker-ce"),
ServerType: pulumi.String("cpx21"),
Location: pulumi.StringPtr("fsn1"),
// Datacenter: pulumi.StringPtr("fsn1"),
ServerType: pulumi.String(cfg.ServerType),
Location: pulumi.StringPtr("hel1"),
Networks: hcloud.ServerNetworkTypeArray{
&hcloud.ServerNetworkTypeArgs{
NetworkId: IDtoIntOutput(cfg.NetworkId),
@ -85,6 +85,24 @@ func CreateServer(ctx *pulumi.Context, cfg CreateServerArgs) ([]*hcloud.Server,
if err != nil {
return nodes, err
}
cephVolume, err := hcloud.NewVolume(ctx, fmt.Sprintf("ceph-%s", sn), &hcloud.VolumeArgs{
Name: pulumi.Sprintf("%s-ceph-vol-0%d", s.Name, i+1),
Size: pulumi.Int(100),
Location: s.Location,
})
if err != nil {
return nodes, fmt.Errorf("couldn't create volume: %w", err)
}
_, err = hcloud.NewVolumeAttachment(ctx, fmt.Sprintf("ceph-vol-attach-%s", sn), &hcloud.VolumeAttachmentArgs{
VolumeId: IDtoIntOutput(cephVolume.ID()),
ServerId: IDtoIntOutput(s.ID()),
})
if err != nil {
return nodes, fmt.Errorf("couldn't attach volume to node %d", i)
}
nodes = append(nodes, s)
nextIp = IncrementIP(net.ParseIP(nextIp)).String()
}