#!/bin/bash

# Get the number of GPUs in the system.
# Only the first line of output is used. Anything that is not a plain
# non-negative integer (nvidia-smi missing, driver error text, empty output)
# is treated as zero GPUs, so the arithmetic in the per-GPU loop never sees a
# non-numeric value and the script still emits a well-formed, empty document.
num_gpus=$(nvidia-smi --query-gpu=count --format=csv,noheader,nounits --id=0 | head -n 1)
num_gpus=${num_gpus//[[:space:]]/}
if [[ $num_gpus =~ ^[0-9]+$ ]]; then
  # Force base 10 so a value with a leading zero is not parsed as octal
  num_gpus=$((10#$num_gpus))
else
  num_gpus=0
fi

#######################################
# Convert a throttle-reason state to a numeric code.
# The comparison is an exact string match, so callers must remove any spaces
# from the state text before passing it in.
# Arguments: $1 - state text
# Outputs:   1 for "Active", 0 for "NotActive", -1 for anything else
#            (including "[N/A]")
#######################################
convert_throttle_reason() {
  local state=$1
  if [[ $state == "Active" ]]; then
    echo "1"
  elif [[ $state == "NotActive" ]]; then
    echo "0"
  else
    # "[N/A]" and every unrecognised value share the same code
    echo "-1"
  fi
}

#######################################
# Replace the "not available" markers with zero.
# Arguments: $1 - raw metric text
# Outputs:   0 when $1 is exactly "[N/A]" or "N/A"; otherwise $1 unchanged
#######################################
numerical_value_check() {
  local raw=$1
  if [[ $raw == "[N/A]" || $raw == "N/A" ]]; then
    echo "0"
    return
  fi
  # Every other value is passed through as-is
  echo "$raw"
}

#######################################
# Convert a performance state such as "P8" to its numeric level.
# Arguments: $1 - pstate text, with or without the leading "P"
# Outputs:   the level digits (e.g. 8), or -1 when the remainder is not a
#            plain non-negative integer (e.g. "[N/A]" or an empty value), so
#            the result is always usable as a JSON number
#######################################
convert_pstate() {
  local level=${1#P} # drop a single leading 'P' only
  if [[ $level =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$level"
  else
    printf '%s\n' "-1"
  fi
}

#######################################
# Print a value as a JSON scalar, without a trailing newline.
# Arguments:
#   $1 - the value
#   $2 - optional type; "string" selects a quoted JSON string
# Outputs:
#   string type: $1 wrapped in double quotes, with backslash, double quote,
#                newline, carriage return and tab escaped
#   otherwise:   $1 with surrounding whitespace removed when it is a valid
#                JSON number or one of true/false/null; the literal null for
#                anything else (empty, "N/A", ...) so the surrounding document
#                stays parseable
#######################################
json_value() {
  local value=$1 kind=${2:-}
  local number_re='^-?(0|[1-9][0-9]*)([.][0-9]+)?([eE][+-]?[0-9]+)?$'

  if [[ $kind == "string" ]]; then
    # Escape the backslash first so later escapes are not doubled
    value=${value//\\/\\\\}
    value=${value//\"/\\\"}
    value=${value//$'\n'/\\n}
    value=${value//$'\r'/\\r}
    value=${value//$'\t'/\\t}
    printf '"%s"' "$value"
    return
  fi

  # Trim leading and trailing whitespace before validating the bare value
  value=${value#"${value%%[![:space:]]*}"}
  value=${value%"${value##*[![:space:]]}"}

  if [[ $value =~ $number_re || $value == "true" || $value == "false" || $value == "null" ]]; then
    printf '%s' "$value"
  else
    printf '%s' "null"
  fi
}

# Open the top-level JSON object, then the "gpus" object (keyed by GPU name,
# not an array). printf reuses the '%s' format for each argument, so both
# fragments are written back to back with no newline.
printf '%s' '{' '"gpus": {'

#######################################
# Normalize the named global variables in place so each one holds a valid
# JSON number: surrounding whitespace is dropped, and any value that is not a
# plain decimal number (empty, "[N/A]", "-", error text, ...) is replaced by
# the fallback.
# Arguments: $1 - fallback value; $2... - names of the variables to normalize
#######################################
normalize_gpu_metrics() {
  local _fallback=$1 _name
  local _number_re='^[[:space:]]*(-?(0|[1-9][0-9]*)([.][0-9]+)?)[[:space:]]*$'
  shift
  for _name in "$@"; do
    if [[ ${!_name} =~ $_number_re ]]; then
      printf -v "$_name" '%s' "${BASH_REMATCH[1]}"
    else
      printf -v "$_name" '%s' "$_fallback"
    fi
  done
}

# Iterate over all GPUs and collect metrics for each separately
for (( gpu_index = 0; gpu_index < num_gpus; gpu_index++ )); do
  # Query GPU metrics for the current GPU
  output=$(nvidia-smi --query-gpu=utilization.gpu,temperature.gpu,temperature.memory,fan.speed,power.draw,power.limit,clocks.current.video,clocks.current.memory,clocks.current.sm,clocks.current.graphics,utilization.memory,memory.free,memory.used,memory.total,pstate,power.draw.instant --id="$gpu_index" --format=csv,noheader,nounits)

  # Query the throttle reasons for the current GPU
  throttle_output=$(nvidia-smi --query-gpu=clocks_throttle_reasons.hw_slowdown,clocks_throttle_reasons.gpu_idle,clocks_throttle_reasons.applications_clocks_setting,clocks_throttle_reasons.sw_power_cap,clocks_throttle_reasons.hw_thermal_slowdown,clocks_throttle_reasons.hw_power_brake_slowdown,clocks_throttle_reasons.sw_thermal_slowdown,clocks_throttle_reasons.sync_boost --id="$gpu_index" --format=csv,noheader,nounits)

  # One query yields both the number of compute processes and the memory they
  # use in total. Summing matters: with several processes the per-process
  # values arrive on separate lines and only the first would be reported.
  process_summary=$(nvidia-smi --query-compute-apps=pid,used_gpu_memory --id="$gpu_index" --format=csv,noheader,nounits \
    | awk -F',' 'NF { count++; total += $2 } END { printf "%d %d", count, total }')
  read -r number_active_processes compute_running_process <<< "$process_summary"

  IFS=',' read -r utilization_gpu temperature_gpu temperature_memory fan_speed power_draw power_limit clocks_current_video clocks_current_memory clocks_current_sm clocks_current_graphics utilization_memory memory_free memory_used memory_total pstate power_draw_instant <<< "$output"
  IFS=',' read -r hw_slowdown gpu_idle applications_clocks_setting sw_power_cap hw_thermal_slowdown hw_power_brake_slowdown sw_thermal_slowdown sync_boost <<< "$throttle_output"

  fan_speed=$(numerical_value_check "${fan_speed// /}")
  temperature_memory=$(numerical_value_check "${temperature_memory// /}")

  # Run nvidia-smi dmon to collect 2 samples of FB memory and PCIe throughput for the current GPU
  readarray -t metrics < <(nvidia-smi dmon -i "$gpu_index" -s mt -c 2 | grep -E '^[[:space:]]*[0-9]+')

  # Use the last sample collected (the first one could be 0 if it's captured
  # right after the command starts). Guard against no samples at all, which
  # would otherwise be a "bad array subscript" error.
  last_sample=""
  if (( ${#metrics[@]} > 0 )); then
    last_sample=${metrics[${#metrics[@]} - 1]}
  fi

  # Columns: gpu index, FB used, BAR1 used, PCIe rx, PCIe tx; the trailing
  # placeholder absorbs any additional columns so they cannot leak into tx.
  read -r _ fb_memory_used bar1_memory_used rx_throughput tx_throughput _ <<< "$last_sample"

  # Everything emitted below as a bare JSON number must actually be one;
  # unavailable readings become 0, matching numerical_value_check.
  normalize_gpu_metrics 0 \
    utilization_gpu temperature_gpu temperature_memory fan_speed \
    power_draw power_limit power_draw_instant \
    clocks_current_video clocks_current_memory clocks_current_sm clocks_current_graphics \
    utilization_memory memory_free memory_used memory_total \
    fb_memory_used bar1_memory_used rx_throughput tx_throughput \
    number_active_processes compute_running_process

  # Convert floating-point metrics to integers (printf %.0f rounds to the
  # nearest integer; it does not truncate)
  for metric_name in power_draw power_limit power_draw_instant clocks_current_video clocks_current_memory clocks_current_sm clocks_current_graphics; do
    printf -v "$metric_name" '%.0f' "${!metric_name}"
  done

  # Power draw as a percentage of the limit; 0 when the limit is unknown or
  # zero, which avoids a division by zero and keeps the field numeric.
  if (( power_limit > 0 )); then
    power_draw_pct=$(awk -v draw="$power_draw" -v limit="$power_limit" \
      'BEGIN { printf "%.0f", (draw / limit) * 100 }')
  else
    power_draw_pct=0
  fi

  # Convert throttle reasons to numeric values
  hw_slowdown=$(convert_throttle_reason "${hw_slowdown// /}")
  gpu_idle=$(convert_throttle_reason "${gpu_idle// /}")
  applications_clocks_setting=$(convert_throttle_reason "${applications_clocks_setting// /}")
  sw_power_cap=$(convert_throttle_reason "${sw_power_cap// /}")
  hw_thermal_slowdown=$(convert_throttle_reason "${hw_thermal_slowdown// /}")
  hw_power_brake_slowdown=$(convert_throttle_reason "${hw_power_brake_slowdown// /}")
  sw_thermal_slowdown=$(convert_throttle_reason "${sw_thermal_slowdown// /}")
  sync_boost=$(convert_throttle_reason "${sync_boost// /}")

  # Convert pstate to numeric value; -1 marks an unavailable state because 0
  # is itself a real performance level
  pstate_numeric=$(convert_pstate "${pstate// /}")
  normalize_gpu_metrics -1 pstate_numeric

  # FB memory used as a percentage of total memory; 0 when the total is
  # unknown. NOTE(review): 1.04859 presumably converts memory.total from MiB
  # to the MB unit reported by dmon (the exact factor would be 1.048576) —
  # confirm against the nvidia-smi documentation.
  fb_memory_used_pct=$(awk -v used="$fb_memory_used" -v total="$memory_total" \
    'BEGIN { if (total > 0) { printf "%.0f", (used / (total * 1.04859)) * 100 } else { printf "0" } }')

  # Output the GPU information as one JSON object keyed "gpu<index>"
  printf '"gpu%s": {' "$gpu_index"
  printf '"utilization": %s,' "$(json_value "$utilization_gpu")"
  printf '"temperature": %s,' "$(json_value "$temperature_gpu")"
  printf '"fan_speed": %s,' "$(json_value "$fan_speed")"
  printf '"power": {'
  printf '"draw": %s,' "$(json_value "$power_draw")"
  printf '"limit": %s,' "$(json_value "$power_limit")"
  printf '"draw_pct": %s,' "$(json_value "$power_draw_pct")"
  printf '"draw_instant": %s' "$(json_value "$power_draw_instant")"
  printf '},'
  printf '"clocks": {'
  printf '"video": %s,' "$(json_value "$clocks_current_video")"
  printf '"memory": %s,' "$(json_value "$clocks_current_memory")"
  printf '"sm": %s,' "$(json_value "$clocks_current_sm")"
  printf '"graphics": %s' "$(json_value "$clocks_current_graphics")"
  printf '},'
  printf '"memory": {'
  printf '"utilization": %s,' "$(json_value "$utilization_memory")"
  printf '"free": %s,' "$(json_value "$memory_free")"
  printf '"used": %s,' "$(json_value "$memory_used")"
  printf '"total": %s,' "$(json_value "$memory_total")"
  printf '"temperature":%s' "$(json_value "$temperature_memory")"
  printf '},'
  printf '"framebuffer_memory": {'
  printf '"used": %s,' "$(json_value "$fb_memory_used")"
  printf '"total": %s,' "$(json_value "$memory_total")"
  printf '"used_pct": %s' "$(json_value "$fb_memory_used_pct")"
  printf '},'
  printf '"bar1_memory": {'
  printf '"used": %s' "$(json_value "$bar1_memory_used")"
  printf '},'
  printf '"processes": {'
  printf '"count": %s,' "$(json_value "$number_active_processes")"
  printf '"memory_used": %s' "$(json_value "$compute_running_process")"
  printf '},'
  printf '"throttle_reasons": {'
  printf '"hw_slowdown": %s,' "$(json_value "$hw_slowdown")"
  printf '"gpu_idle": %s,' "$(json_value "$gpu_idle")"
  printf '"applications_clocks_setting": %s,' "$(json_value "$applications_clocks_setting")"
  printf '"sw_power_cap": %s,' "$(json_value "$sw_power_cap")"
  printf '"hw_thermal_slowdown": %s,' "$(json_value "$hw_thermal_slowdown")"
  printf '"hw_power_brake_slowdown": %s,' "$(json_value "$hw_power_brake_slowdown")"
  printf '"sw_thermal_slowdown": %s,' "$(json_value "$sw_thermal_slowdown")"
  printf '"sync_boost": %s' "$(json_value "$sync_boost")"
  printf '},'
  printf '"performance": {'
  printf '"rx_throughput": %s,' "$(json_value "$rx_throughput")"
  printf '"tx_throughput": %s,' "$(json_value "$tx_throughput")"
  printf '"pstate": %s' "$(json_value "$pstate_numeric")"
  printf '}}'

  # Add a comma between GPU objects if there are more GPUs to process
  if (( gpu_index + 1 < num_gpus )); then
    printf ','
  fi
done

# Close the "gpus" object and then the top-level JSON object, ending the
# document with a single newline
printf '%s\n' '}}'

