Commit 7fedc3ce authored by Gaurav Kumar

restructure code

parent f2936027
#!/bin/sh
python3 repositories_visualizer.py
\ No newline at end of file
import pyspark.sql.functions as F
from pyspark import SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, MinHashLSH
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StringType
import utils as u
def countEach(column: str, new_column: str):
"""
Counts each item in the column `column` ("dependencies" or "mcrTags") of the corresponding repositories file
and saves the counts in a new file in which the column is renamed to `new_column`.
"""
u.delete_dir(u.spark_dir)
df = u.read_csv(spark, u.output_dir + u.repos_with + column + ".csv")
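# The CSV stores each list as a bracketed string such as "[a,b,c]"; strip the brackets and split on commas to recover an array column.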
df = df.withColumn(column, F.split(
F.regexp_replace(column, r"[\[\]]", ""), ","))
items = []
for row in df.rdd.collect():
for i in row[column]:
items.append(i)
data = spark.createDataFrame(items, StringType()).toDF(column) \
.groupBy(column).count().withColumnRenamed(column, new_column)
u.write_csv(data.coalesce(1), u.spark_dir)
u.copy_csv(u.spark_dir, u.output_dir + column + "_counted.csv")
def countSets(column: str):
"""
Counts each distinct set in the column `column` ("dependencies" or "mcrTags") of the corresponding
repositories file and saves the results in a new file.
"""
u.delete_dir(u.spark_dir)
df = u.read_csv(spark, u.output_dir + u.repos_with + column + ".csv")
df = df.groupBy(column).count()
u.write_csv(df.coalesce(1), u.spark_dir)
u.copy_csv(u.spark_dir, u.output_dir + column + "_sets_counted.csv")
def computeJaccardSimilarity(column: str, threshold: float):
"""
Computes the pairwise Jaccard similarity of repositories on the column `column` ("dependencies" or "mcrTags"),
keeps pairs whose similarity is at least `threshold`, and saves the results in a new file.
"""
u.delete_dir(u.spark_dir)
df = u.read_csv(spark, u.output_dir + u.repos_with + column + ".csv")
df = df.withColumn(column, F.split(
F.regexp_replace(column, r"[\[\]]", ""), ","))
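# HashingTF turns each dependency list into a sparse term-frequency vector; MinHashLSH hashes these vectors so an approximate similarity join on Jaccard distance becomes feasible.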
model = Pipeline(stages=[
HashingTF(inputCol=column, outputCol="vectors"),
MinHashLSH(inputCol="vectors", outputCol="lsh", numHashTables=10)
]).fit(df)
data_t = model.transform(df)
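# approxSimilarityJoin filters on Jaccard distance, so a similarity threshold t translates to a distance threshold of 1 - t; the distance is converted back to a similarity below.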
data_s = model.stages[-1].approxSimilarityJoin(
data_t, data_t, 1 - threshold, distCol="similarity")
result = data_s.withColumn("intersection", F.array_intersect(
F.col("datasetA." + column), F.col("datasetB." + column)).cast("string")) \
.select(F.col("datasetA.repositoryName").alias("repositoryName1"),
F.col("datasetB.repositoryName").alias("repositoryName2"),
F.col("intersection"), F.col("similarity")) \
.filter("repositoryName1 < repositoryName2") \
.withColumn("similarity", F.round(1 - F.col("similarity"), 2))
u.write_csv(result.coalesce(1), u.spark_dir)
u.copy_csv(u.spark_dir, u.output_dir +
u.repos_with + column + "_similarity.csv")
def countPairs():
"""
Creates all pairs of frequent dependencies (count >= 100), counts how many repositories
use each pair, and saves the results together with the per-dependency counts and
proportions in a new file.
"""
u.delete_dir(u.spark_dir)
df1 = u.read_csv(spark, u.output_dir + u.repos_with +
u.dependencies + ".csv")
df1 = df1.withColumn(u.dependencies, F.split(
F.regexp_replace(u.dependencies, r"[\[\]]", ""), ","))
df2 = u.read_csv(spark, u.output_dir + u.dependencies + "_counted.csv")
df2 = df2.filter(F.col("count") >= 100)
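# Build every unordered pair of frequent dependencies exactly once by cross-joining the filtered list with itself and keeping only dependency1 < dependency2.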
pairs = df2.select(F.col("dependency").alias("dependency1")) \
.crossJoin(df2.select(F.col("dependency").alias("dependency2"))) \
.filter("dependency1 < dependency2")
counted = pairs.join(df1, F.array_contains(df1[u.dependencies], pairs["dependency1"]) &
F.array_contains(df1[u.dependencies], pairs["dependency2"])) \
.groupBy("dependency1", "dependency2").count().drop("repositoryName").drop(u.dependencies)
df3 = df2.withColumnRenamed("dependency", "dependency1") \
.withColumnRenamed("count", "count1")
df4 = df2.withColumnRenamed("dependency", "dependency2") \
.withColumnRenamed("count", "count2")
data = counted.join(df3, "dependency1").join(df4, "dependency2") \
.select("dependency1", "dependency2", "count", "count1", "count2")
data = data.withColumn("proportion1", F.round(data["count"] / data["count1"], 2)) \
.withColumn("proportion2", F.round(data["count"] / data["count2"], 2)) \
.withColumn("maxProportion", F.greatest(F.col("proportion1"), F.col("proportion2")))
u.write_csv(data.coalesce(1), u.spark_dir)
u.copy_csv(u.spark_dir, u.output_dir +
u.dependencies + "_pairs_counted.csv")
if __name__ == "__main__":
sc = SparkContext("local", "applying-apis-victor")
spark = SparkSession(sc)
countEach("dependencies", "dependency")
countSets("dependencies")
countSets("mcrTags")
computeJaccardSimilarity("dependencies", 0.7)
computeJaccardSimilarity("mcrTags", 0.7)
countPairs()
from pathlib import Path
import pyspark.sql.functions as F
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructField, StructType
import utils as u
def analyze_repository(repository: str, abstraction: str):
"""
Analyzes the `repository` on the abstraction level `abstraction` ("package", "class", "method").
Creates a `proportion_file` and a `sets_file`.
"""
proportion_file = u.analyzed_data_dir + abstraction + "/" + \
u.api_proportion_file + abstraction + "_" + repository + ".csv"
sets_file = u.analyzed_data_dir + abstraction + "/" + \
u.api_sets_file + abstraction + "_" + repository + ".csv"
if Path(proportion_file).exists() and Path(sets_file).exists():
print("Analysis for " + repository + " already exists.")
return None
data_file = u.data_dir + repository + ".csv"
if not Path(data_file).exists():
print("Skipping " + repository + ", " + data_file + " missing.")
return None
df1 = u.read_csv(spark, data_file)
if df1.rdd.isEmpty():
print("Skipping " + repository + ", " + data_file + " is empty.")
return None
df1 = df1.filter(F.col(u.isAPIClass) == "true").na.fill("")
grouping_columns = [u.packageName]
if abstraction == "method":
grouping_columns.extend([u.className, u.methodName])
elif abstraction == "class":
grouping_columns.append(u.className)
grouping_columns_api = grouping_columns.copy()
grouping_columns_api.extend([u.api, u.mcrCategories, u.mcrTags])
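# Count all API usages per abstraction unit (countAll) and the usages per API (count); the proportion column is then count / countAll.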
counted_all = df1.groupBy(grouping_columns) \
.agg(F.count(u.api).alias(u.countAll))
counted = df1.groupBy(grouping_columns_api) \
.agg(F.count(u.api).alias(u.count))
joined = counted.join(counted_all, grouping_columns, "outer") \
.withColumn(u.proportion, F.round(F.col(u.count) / F.col(u.countAll), 2))
u.delete_dir(u.spark_dir)
u.write_csv(joined.coalesce(1), u.spark_dir)
u.copy_csv(u.spark_dir, proportion_file)
df2 = u.read_csv(spark, proportion_file)
df2 = df2.groupBy(grouping_columns).agg(F.regexp_replace(
F.sort_array(F.collect_list(u.api)).cast("string"), " ", "").alias(u.apis))
u.delete_dir(u.spark_dir)
u.write_csv(df2.coalesce(1), u.spark_dir)
u.copy_csv(u.spark_dir, sets_file)
def analyze(repositories_selected_file: str):
"""
Analyzes all repositories in the `repositories_selected_file` on all abstraction levels
("package", "class", "method").
"""
df = u.read_csv(spark, repositories_selected_file)
for repository in df.select("repositoryName").rdd.flatMap(lambda x: x).collect():
analyze_repository(repository.replace("/", "_"), "method")
analyze_repository(repository.replace("/", "_"), "class")
analyze_repository(repository.replace("/", "_"), "package")
def count_pairs_in_abstraction_and_repo(repository: str, abstraction: str, dependency1: str, dependency2: str):
"""
Counts the occurrences of the dependencies `dependency1` and `dependency2` in the `repository`
on the abstraction level `abstraction`. Returns a list that contains `repository`,
`count_dependency1`, `count_dependency2`, and `count_both`.
"""
sets_file = u.analyzed_data_dir + abstraction + "/" + \
u.api_sets_file + abstraction + "_" + \
repository.replace("/", "_") + ".csv"
if not Path(sets_file).exists():
print("Skipping " + repository + ", " + sets_file + " missing.")
return []
df = u.read_csv(spark, sets_file)
df = df.withColumn(u.apis, F.split(
F.regexp_replace(u.apis, r"[\[\]]", ""), ","))
count_both = df.filter(F.array_contains(u.apis, dependency1) &
F.array_contains(u.apis, dependency2)).count()
count_dependency1 = df.filter(
F.array_contains(u.apis, dependency1)).count()
count_dependency2 = df.filter(
F.array_contains(u.apis, dependency2)).count()
return [repository, count_dependency1, count_dependency2, count_both]
def count_pairs_in_abstraction(abstraction: str, repositories_selected_file: str, dependency1: str, dependency2: str):
"""
Counts the occurrences of the dependencies `dependency1` and `dependency2` in all repositories
contained in the `repositories_selected_file` on the abstraction level `abstraction`. Writes
them to a CSV file. Returns a list that contains `abstraction`,
`count_dependency1`, `count_dependency2`, and `count_both`.
"""
counts = []
df = u.read_csv(spark, repositories_selected_file)
for repository in df.select("repositoryName").rdd.flatMap(lambda x: x).collect():
count = count_pairs_in_abstraction_and_repo(
repository, abstraction, dependency1, dependency2)
if count != []:
counts.append(count)
schema = StructType([StructField("repositoryName", StringType(), False),
StructField(dependency1, IntegerType(), False),
StructField(dependency2, IntegerType(), False),
StructField("both", IntegerType(), False)])
data = spark.createDataFrame(counts, schema)
u.delete_dir(u.spark_dir)
u.write_csv(data.coalesce(1), u.spark_dir)
u.copy_csv(u.spark_dir, u.analyzed_data_dir + "counts" + "/" + dependency1.replace(":", "_")
+ "_" + dependency2.replace(":", "_") + "_" + abstraction + ".csv")
count_dependency1 = data.select(
F.sum(F.col("`" + dependency1 + "`"))).collect()[0][0]
count_dependency2 = data.select(
F.sum(F.col("`" + dependency2 + "`"))).collect()[0][0]
count_both = data.select(F.sum(F.col("both"))).collect()[0][0]
return [abstraction, count_dependency1, count_dependency2, count_both]
def count_pairs(repositories_selected_file: str, dependency1: str, dependency2: str):
"""
Counts the occurrences of the dependencies `dependency1` and `dependency2` in all repositories
contained in the `repositories_selected_file` on all abstraction levels ("package", "class", "method").
"""
all_counts = []
all_counts.append(count_pairs_in_abstraction(
"method", repositories_selected_file, dependency1, dependency2))
all_counts.append(count_pairs_in_abstraction(
"class", repositories_selected_file, dependency1, dependency2))
all_counts.append(count_pairs_in_abstraction(
"package", repositories_selected_file, dependency1, dependency2))
schema = StructType([StructField("abstraction", StringType(), False),
StructField(dependency1, IntegerType(), False),
StructField(dependency2, IntegerType(), False),
StructField("both", IntegerType(), False)])
data = spark.createDataFrame(all_counts, schema)
u.delete_dir(u.spark_dir)
u.write_csv(data.coalesce(1), u.spark_dir)
u.copy_csv(u.spark_dir, u.analyzed_data_dir + "counts" + "/" + "counted_" + dependency1.replace(":", "_")
+ "_" + dependency2.replace(":", "_") + ".csv")
def sample_abstractions(repositories: int, samples_in_repository: int, usage_limit: int, abstraction: str, dependency1: str, dependency2: str):
"""
Samples `repositories` repositories that each have at least `usage_limit` abstractions of type `abstraction`
containing both `dependency1` and `dependency2`. From each sampled repository, `samples_in_repository`
abstractions are sampled.
"""
sample = []
df = u.read_csv(spark, u.analyzed_data_dir + "counts" + "/" + dependency1.replace(":", "_")
+ "_" + dependency2.replace(":", "_") + "_" + abstraction + ".csv")
count = df.count()
df = df.filter(F.col("both") >= usage_limit).select("repositoryName")
count_filtered = df.count()
print("Filtered file", count_filtered, "file of", count,
"repositories with at least", usage_limit, "abstractions that use both dependencies.")
print("Sampled", repositories, "repositories file of", count_filtered)
for repository in sc.parallelize(df.rdd.takeSample(False, repositories)).map(tuple).collect():
data = u.read_csv(spark, u.analyzed_data_dir + abstraction + "/" + u.api_sets_file +
abstraction + "_" + repository[0].replace("/", "_") + ".csv")
data = data.withColumn(u.apis, F.split(
F.regexp_replace(u.apis, r"[\[\]]", ""), ",")) \
.filter(F.array_contains(u.apis, dependency1) &
F.array_contains(u.apis, dependency2)) \
.withColumn("repositoryName", F.lit(repository[0])).na.fill("")
sample_list = sc.parallelize(
data.rdd.takeSample(False, samples_in_repository)).map(tuple).collect()
for t in sample_list:
sample.append(t)
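# StructType.add appends fields in place, so the schema below is extended to match the column order of the sampled rows.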
schema = StructType([StructField(u.filePath, StringType(), False),
StructField(u.packageName, StringType(), False)])
if abstraction == "method":
schema.add(StructField(u.className, StringType(), False))
schema.add(StructField(u.methodName, StringType(), False))
elif abstraction == "class":
schema.add(StructField(u.className, StringType(), False))
schema.add(StructField(u.apis, StringType(), False))
schema.add(StructField("repositoryName", StringType(), False))
data = spark.createDataFrame(sample, schema) \
.withColumn(u.apis, F.regexp_replace(u.apis, " ", ""))
u.delete_dir(u.spark_dir)
u.write_csv(data.coalesce(1), u.spark_dir)
u.copy_csv(u.spark_dir, u.analyzed_data_dir + "sampled_abstractions" + "/"
+ "sampled_abstractions_" + dependency1.replace(":", "_") + "_"
+ dependency2.replace(":", "_") + ".csv")
if __name__ == "__main__":
sc = SparkContext("local", "applying-apis-victor")
spark = SparkSession(sc)
dependency1 = "org.apache.lucene:lucene-analyzers-common"
dependency2 = "org.apache.lucene:lucene-core"
repositories_selected_file = u.repositories_selected_dir + \
dependency1.replace(":", "_") + "_" + \
dependency2.replace(":", "_") + ".csv"
analyze(repositories_selected_file)
count_pairs(repositories_selected_file, dependency1, dependency2)
sample_abstractions(5, 2, 5, "method", dependency1, dependency2)
import logging
import numpy as np
import pandas as pd
import plotly.express as px
import pyspark.sql.functions as F
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
import utils as u
def get_abstraction_grouping_columns(abstraction: str):
"""
Gets the columns `packageName`, `className`, and `methodName` depending
on the `abstraction` level ("package", "class", "method").
"""
if abstraction == "method":
return [u.packageName, u.className, u.methodName]
elif abstraction == "class":
return [u.packageName, u.className]
elif abstraction == "package":
return [u.packageName]
def get_api_grouping_columns(characterization_type: str):
"""
Gets the columns `api`, `mcrCategories`, and `mcrTags` depending
on the `characterization_type` ("api", "mcrCategories", "mcrTags").
"""
if characterization_type == u.api:
return [characterization_type, u.mcrCategories, u.mcrTags]
return [u.mcrCategories, u.mcrTags]
def getDependenceString(apply_dependence: bool):
"""
Returns "_with_dep" if `apply_dependence` is true.
"""
if apply_dependence:
return "_with_dep"
return ""
def characterize_abstractions(repository: str, abstraction: str, characterization_type: str, apply_dependence: bool):
"""
Characterizes the abstractions of the `repository` at the `abstraction` level with the
`characterization_type`, taking the dependence relationships into account if `apply_dependence` is true.
"""
abstr_group_cols = get_abstraction_grouping_columns(abstraction)
api_group_cols = abstr_group_cols.copy()
api_group_cols.extend(
get_api_grouping_columns(characterization_type))
df1 = u.read_csv(spark, u.data_dir + repository + ".csv") \
.select(abstr_group_cols) \
.fillna(".").distinct()
df2 = u.read_csv(spark, u.analyzed_data_dir + abstraction + "/" +
u.api_proportion_file + abstraction + "_" + repository + ".csv") \
.fillna(".", abstr_group_cols)
# print('df2:', u.analyzed_data_dir + abstraction + "/" +
# u.api_proportion_file + abstraction + "_" + repository + ".csv")
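# For tag-based characterization, turn the bracketed tag string into an array and explode it so that each (abstraction, tag) pair becomes its own row.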
if characterization_type == u.mcrTags:
df2 = df2.withColumn(u.mcrTags, F.split(
F.regexp_replace(u.mcrTags, r"[\[\]]", ""), ",")) \
.withColumn(u.mcrTags, F.explode(u.mcrTags))
if apply_dependence:
df = u.read_csv(spark, u.analyzed_data_dir + abstraction + "/" + u.api_sets_file +
abstraction + "_" + repository + ".csv") \
.fillna(".", abstr_group_cols) \
.withColumn(u.apis, F.split(
F.regexp_replace(u.apis, r"[\[\]]", ""), ",")) \
.withColumn("dependence1",
F.when(F.array_contains(u.apis, dependency1) &
F.array_contains(u.apis, dependency5), True).otherwise(False)) \
.withColumn("dependence2",
F.when(F.array_contains(u.apis, dependency2) &
F.array_contains(u.apis, dependency5), True).otherwise(False)) \
.withColumn("dependence3",
F.when(F.array_contains(u.apis, dependency3) &
F.array_contains(u.apis, dependency4), True).otherwise(False)) \
.withColumn("dependence4",
F.when(F.array_contains(u.apis, dependency4) &
F.array_contains(u.apis, dependency6), True).otherwise(False))
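# When dependence is applied, the left anti joins below remove characterization rows whose abstraction matches a flagged dependence and whose tag/API belongs to the corresponding dependence set.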
if characterization_type == u.mcrTags:
df2 = df2.join(df, (df2[u.packageName] == df[u.packageName]) &
(df2[u.className] == df[u.className]) &
(df2[u.methodName] == df[u.methodName]) &
((df["dependence1"] & df2[u.mcrTags].isin(dependence1tags)) |
(df["dependence2"] & df2[u.mcrTags].isin(dependence2tags)) |
(df["dependence3"] & df2[u.mcrTags].isin(dependence3tags)) |
(df["dependence4"] & df2[u.mcrTags].isin(dependence3tags))), "leftanti")
else:
df2 = df2.join(df, (df2[u.packageName] == df[u.packageName]) &
(df2[u.className] == df[u.className]) &
(df2[u.methodName] == df[u.methodName]) &
((df["dependence1"] & (df2[u.api] == dependency1)) |
(df["dependence2"] & (df2[u.api] == dependency2)) |
(df["dependence3"] & (df2[u.api] == dependency3))), "leftanti")
df2 = df2.select(api_group_cols).filter(
F.col(characterization_type).isNotNull())
data = df1.join(df2, abstr_group_cols, how="left").fillna("none")
u.delete_dir(u.spark_dir)
u.write_csv(data.coalesce(1), u.spark_dir)
u.copy_csv(u.spark_dir, u.characterization_dir + u.characterization_file +
repository + "_" + abstraction + "_" + characterization_type + getDependenceString(apply_dependence) + ".csv")
# print(u.spark_dir, u.characterization_dir + u.characterization_file +
# repository + "_" + abstraction + "_" + characterization_type + getDependenceString(apply_dependence) + ".csv")
# print(u.spark_dir)
def visualize(repository: str, abstraction: str, characterization_type: str, apply_dependence: bool):
"""
Visualizes the `repository` as a treemap of the abstractions at the `abstraction` level that were
characterized with the `characterization_type`, taking the dependence relationships into account
if `apply_dependence` is true.
"""
data = u.read_csv(spark, u.characterization_dir + u.characterization_file + repository + "_" +
abstraction + "_" + characterization_type + getDependenceString(apply_dependence) + ".csv")
data = data.toPandas()
path_cols = get_abstraction_grouping_columns(abstraction)
path_cols.append(characterization_type)
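# px.treemap nests one treemap level per entry in `path`, so the hierarchy follows the abstraction columns and ends with the characterization value.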
fig = px.treemap(
data, path=path_cols, hover_data=get_api_grouping_columns(characterization_type), color=characterization_type,
color_discrete_sequence=["burlywood", "mediumaquamarine", "deepskyblue", "yellow",
"lavender", "pink", "#c37c4a", "gold", "sandybrown", "silver", "gray"],
color_discrete_map={"(?)": "#ffd695", "none": "#66b35d", "junit:junit": "#4c8ed4", "Testing Frameworks": "#4c8ed4", "testing": "#4c8ed4",
"org.mockito:mockito-core": "#e45756", "Mocking": "#e45756", "mock": "#e45756",
"org.hamcrest:hamcrest-all": "greenyellow", "matching": "greenyellow",
"org.apache.lucene:lucene-core": "orchid", "Full-Text Indexing Libraries": "orchid", "lucene": "orchid",
"org.apache.lucene:lucene-analyzers-common": "aqua"})
fig.write_image(u.visualization_dir + u.visualization_file + repository + "_" + abstraction +
"_" + characterization_type + getDependenceString(apply_dependence) + ".pdf", width=1500, height=1000)
def api_probability_in_mcrcategories(df):
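"""
For each MCR category in `df`, prints and logs the conditional probability of every API
that occurs in that category (occurrences of the API divided by all occurrences in the category).
"""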
dom_list_mcrcat = df['mcrCategories'].replace(to_replace='none', value=np.nan).dropna()
mcr_cat_arr_name = np.unique(dom_list_mcrcat)
series_mcr_arr = {}
series_mcr_count = {}
for mcr_category in mcr_cat_arr_name:
series_mcr = df.loc[df['mcrCategories'] == mcr_category, 'api']
series_mcr_arr[mcr_category] = series_mcr.to_numpy()
series_mcr_count[mcr_category] = len(series_mcr.to_numpy())
print('----------------------------------------------------------------------------------------------------------------')
print(
'----------------------------------------------------------------------------------------------------------------')
for key, value in series_mcr_arr.items():
unique_api = {}
for api in value:
if api not in unique_api:
unique_api[api] = 1
else:
unique_api[api] = unique_api[api] + 1
total = 0
for ikey, ivalue in unique_api.items():
total += ivalue
for ikey, ivalue in unique_api.items():
prob = ivalue / total
print(f"The probability of api '{ikey}' having mcr category '{key}': {prob}")
lmsg = "The probability of api ", ikey, " having mcr category ", key, ": ", prob
logging.info(lmsg)
print('----------------------------------------------------------------------------------------------------------------')
def calculate_dominant_mcrcategories():
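"""
Reads the characterization file of a single repository, counts the MCR categories
(excluding "none"), and computes each category's share as a percentage.
"""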
df = pd.read_csv(r'file/characterization/characterization_Novetta_CLAVIN_method_api.csv', engine='python')
dom_list = df['mcrCategories'].replace(to_replace='none', value=np.nan).dropna().value_counts()
df_mcr_count = pd.DataFrame({'mcr category': dom_list.index, 'count': dom_list.values})
percent_arr = {}
percent_array = []
for i in range(len(df_mcr_count)):
percent = df_mcr_count['count'][i] / df_mcr_count["count"].sum() * 100
percent_arr[df_mcr_count['mcr category'][i]] = round(percent, 2)
percent_array.append(round(percent, 2))
df_mcr_count['percent'] = percent_array