Analysis of Results

Author

clown

Open In Colab View on GitHub

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Apply a consistent seaborn look ("whitegrid") to every figure in this notebook.
sns.set_theme(style="whitegrid")
print("Libraries loaded successfully.")
Libraries loaded successfully.
# load results.csv into pandas dataframe

import os
import urllib.request

# Single source of truth for the data location: the same relative path is used
# as the download target and for reading, so the two can never drift apart.
RESULTS_PATH = "llm_bug_analysis/results/results.csv"
RESULTS_URL = (
    "https://raw.githubusercontent.com/engemkeres/llm-analysis-thesis/main/"
    + RESULTS_PATH
)

# Detect Google Colab: the `google.colab` package only exists inside Colab.
try:
    import google.colab  # noqa: F401
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

# In Colab the repository files are not on disk, so fetch the CSV from GitHub.
if IN_COLAB:
    print("Running in Google Colab - downloading data...")
    os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)
    try:
        urllib.request.urlretrieve(RESULTS_URL, RESULTS_PATH)
        print("Data downloaded successfully from GitHub.")
    except Exception as e:
        # Best-effort download: report the failure and fall through so the
        # user can upload the file manually instead of crashing the notebook.
        print(f"Error downloading data: {e}")
        print("Please upload results.csv manually.")

# Load the data and show a quick preview plus the column schema.
try:
    df = pd.read_csv(RESULTS_PATH)
    print("Data loaded successfully. First 5 rows:")
    display(df.head())
    print("\nData columns and types:")
    df.info()
except FileNotFoundError:
    print("ERROR: results.csv not found.")
    if IN_COLAB:
        print("The automatic download may have failed. Please upload the file manually using:")
        print("from google.colab import files")
        print("uploaded = files.upload()")
    else:
        print("Make sure you have run the analysis pipeline first.")
Data loaded successfully. First 5 rows:
timestamp repo_name bug_commit_sha file_path commit_message issue_title issue_body llm_model complexity_before_cc complexity_before_cognitive ... complexity_after_llm_avg_params complexity_after_llm_total_tokens human_tests_passed human_lines_added human_lines_deleted human_total_diff complexity_after_human_cc complexity_after_human_cognitive complexity_after_human_avg_params complexity_after_human_total_tokens
0 2025-10-11T15:02:54.981034 mahmoud/boltons 4815fc8dd1768da5f2d903846d2ab994aa57b0cf NaN Test and fix for #348 (#349) LRU .values() and dict return old entries Hi,\r\n\r\nFirst of all thanks for the excelle... manual_llm 198 135 ... SKIPPED SKIPPED True 44 9 53 199 135 1.68 5014

1 rows × 29 columns


Data columns and types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 29 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   timestamp                            1 non-null      object 
 1   repo_name                            1 non-null      object 
 2   bug_commit_sha                       1 non-null      object 
 3   file_path                            0 non-null      float64
 4   commit_message                       1 non-null      object 
 5   issue_title                          1 non-null      object 
 6   issue_body                           1 non-null      object 
 7   llm_model                            1 non-null      object 
 8   complexity_before_cc                 1 non-null      int64  
 9   complexity_before_cognitive          1 non-null      int64  
 10  complexity_before_avg_params         1 non-null      float64
 11  complexity_before_total_tokens       1 non-null      int64  
 12  llm_patch_applied                    1 non-null      object 
 13  llm_tests_passed                     1 non-null      object 
 14  ai_lines_added                       1 non-null      object 
 15  ai_lines_deleted                     1 non-null      object 
 16  ai_total_diff                        1 non-null      object 
 17  complexity_after_llm_cc              1 non-null      object 
 18  complexity_after_llm_cognitive       1 non-null      object 
 19  complexity_after_llm_avg_params      1 non-null      object 
 20  complexity_after_llm_total_tokens    1 non-null      object 
 21  human_tests_passed                   1 non-null      bool   
 22  human_lines_added                    1 non-null      int64  
 23  human_lines_deleted                  1 non-null      int64  
 24  human_total_diff                     1 non-null      int64  
 25  complexity_after_human_cc            1 non-null      int64  
 26  complexity_after_human_cognitive     1 non-null      int64  
 27  complexity_after_human_avg_params    1 non-null      float64
 28  complexity_after_human_total_tokens  1 non-null      int64  
dtypes: bool(1), float64(3), int64(9), object(16)
memory usage: 357.0+ bytes
# convert llm tests passed into numeric type

# Rows where the LLM stage never ran are marked 'SKIPPED'; exclude them so the
# success rate only reflects bugs the LLM actually attempted to fix.
df_ai = df[df['llm_tests_passed'] != 'SKIPPED'].copy()
if df_ai.empty:
    print("No LLM results found in the data. Skipping AI success rate calculation.")
else:
    # The CSV round-trip leaves this column as the strings 'True'/'False'
    # (object dtype) whenever 'SKIPPED' rows exist, so compare against the
    # literal text. NOTE: the previous astype(bool) was wrong here — every
    # non-empty string, including 'False', is truthy in Python, which would
    # report a 100% success rate for any attempted row.
    df_ai['llm_success'] = df_ai['llm_tests_passed'].astype(str) == 'True'
    overall_success_rate = df_ai['llm_success'].mean()
    print(f"Overall LLM Success Rate (where attempted): {overall_success_rate:.2%}")
No LLM results found in the data. Skipping AI success rate calculation.
# group data by repo and calculate success rate for each

if df_ai.empty:
    print("No LLM results found in the data. Skipping the repository summary plot.")
else:
    # Only meaningful when at least one LLM fix attempt exists in the data.
    pass_rate_per_repo = (
        df_ai.groupby('repo_name')['llm_success']
        .mean()
        .sort_values(ascending=False)
    )

    print("LLM Success Rate by Repository:")
    print(pass_rate_per_repo.apply('{:.2%}'.format))

    # Visualize the per-repository pass rate as a bar chart using the
    # explicit figure/axes interface.
    fig, ax = plt.subplots(figsize=(10, 6))
    pass_rate_per_repo.plot(kind='bar', ax=ax)
    ax.set_title('LLM Test Pass Rate by Repository')
    ax.set_ylabel('Success Rate')
    ax.set_xlabel('Repository')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    # Render the y axis as percentages (str.format ignores the extra `pos`
    # argument matplotlib passes to callable formatters).
    ax.yaxis.set_major_formatter('{:.0%}'.format)
    fig.tight_layout()
    plt.show()
No LLM results found in the data. Skipping the repository summary plot.