Commit a1c605f6 authored by Gregor Zimmermann

up to date version

parent 212d94c3

Too many changes to show.

To preserve performance only 1000 of 1000+ files are displayed.

G Zim,DESKTOP-RVN2OU3/Admin,DESKTOP-RVN2OU3,26.03.2020 23:28,file:///C:/Users/Admin/AppData/Roaming/OpenOffice/4;
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (untitled2)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/untitled2.iml" filepath="$PROJECT_DIR$/.idea/untitled2.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="RSettings" path="C:\Program Files\R\R-3.6.2\bin\R.exe" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
This diff is collapsed.
import threading
import pandas as pd
import os
import re
import time
from random import random
# Binary semaphore (one permit) that prep_func holds while appending to the
# shared CSV file, so only one thread writes at a time.
file_usage = threading.Semaphore(1)
def prep_func(raw_file, path):
    """Vectorize one grammar file and append the resulting row to a CSV file.

    The file is cleaned with ``pre_clean`` and summarised with ``vectorize``;
    three extra columns (``Repo``, ``FileName``, ``Date``) are then derived
    from the file's path before the row is appended.

    Args:
        raw_file: Path of the grammar file. The second-to-last path component
            becomes ``Repo``; the last component is split on ``-``.
        path: CSV file the row is appended to (no header is written).
    """
    accumulated_vec = vectorize(pre_clean(raw_file))

    # Accept both ``\`` and ``/`` so the path does not have to be
    # Windows-style for the component lookup below to work.
    file_path = re.split(r'[\\/]', raw_file)
    file_name = file_path[-1]

    accumulated_vec['Repo'] = file_path[-2]
    accumulated_vec['FileName'] = file_name.split('-')[0]
    # Last dash-separated field minus its final three characters
    # (e.g. the ".g4" suffix).
    accumulated_vec['Date'] = file_name.split('-')[-1][:-3]

    # Serialise appends across worker threads. Using the semaphore as a
    # context manager guarantees it is released even if to_csv raises;
    # otherwise every other worker would block forever.
    with file_usage:
        accumulated_vec.to_csv(path, mode='a', header=False)
def pre_clean(file):
    """Return the "naked" text of a grammar file, without comments or bodies.

    The file is read as UTF-8 and the following are removed, in this order:
    single-quoted strings, ``/* ... */`` comments, ``// ...`` line comments,
    everything from a ``:`` up to the next ``;`` (rule bodies), everything
    from an ``@`` up to the next ``}``, every occurrence of the literal
    tokens ``fragment``, ``mode``, ``options``, ``;``, ``{`` and ``}``, and
    finally all blank lines.

    Args:
        file: Path of the file to read.

    Returns:
        The remaining text, lines joined with ``\\n``.
    """
    # Context manager so the handle is closed even when reading fails.
    with open(file, encoding="utf-8") as handle:
        data = handle.read()

    # Strings go first: they could contain ':' / ';' / comment markers that
    # would otherwise mislead the later patterns.
    data = re.sub(r"'.*?'", '', data)
    data = re.sub(r'/\*.*?\*/', '', data, flags=re.DOTALL)  # /* COMMENT */
    data = re.sub(r'//.*?\n', '', data)                     # // COMMENT\n
    # Rule bodies (requires the strings to be removed already).
    data = re.sub(r':.*?;', '', data, flags=re.DOTALL)
    data = re.sub(r'@.*?}', '', data, flags=re.DOTALL)

    # Plain substring removal; these tokens are literals, not patterns.
    for el in ('fragment', 'mode', 'options', ';', '{', '}'):
        data = data.replace(el, '')

    # Drop lines that are empty or whitespace-only.
    return '\n'.join(line for line in data.splitlines() if line.strip())
def vectorize(data):
    """Summarise pre-cleaned grammar text as a one-row DataFrame.

    Columns: ``Name``, ``Type``, ``TokenVocab``, ``RuleCount``, ``RuleHash``.

    Each line of ``data`` is inspected:

    * a line containing ``tokenVocab`` sets ``TokenVocab`` to the stripped
      text after its last ``=``;
    * a line containing ``grammar`` sets ``Name`` to its last
      whitespace-separated word and, when the line has exactly three words,
      ``Type`` to the second-to-last word;
    * every line that does not contain ``grammar`` (including a
      ``tokenVocab`` line) increments ``RuleCount`` and adds ``hash(line)``
      to ``RuleHash``.

    Args:
        data: Text as produced by ``pre_clean``.

    Returns:
        A ``pandas.DataFrame`` with a single row. ``Name`` and ``Type``
        default to ``'-'`` and ``TokenVocab`` to ``''``.
    """
    name = '-'
    teip = '-'
    toVo = ''
    rule_hash = 0
    rule_count = 0
    for line in data.splitlines():
        if 'tokenVocab' in line:
            toVo = line.split('=')[-1].strip()
        if 'grammar' in line:
            x = line.split()
            name = x[-1].strip()
            if len(x) == 3:
                # NOTE(review): for "lexer grammar Foo" this yields
                # "grammar", not "lexer" - confirm that is intended.
                teip = x[-2].strip()
        else:
            rule_count += 1
            # NOTE: str hashes are randomised per interpreter run unless
            # PYTHONHASHSEED is fixed, so RuleHash is only comparable
            # between rows produced by the same process.
            rule_hash += hash(line)
    # The former "rule_hash / rule_count" was never used and raised
    # ZeroDivisionError for text without any rule line, so it is gone.
    return pd.DataFrame({
        'Name': [name],
        'Type': [teip],
        'TokenVocab': [toVo],
        'RuleCount': [rule_count],
        'RuleHash': [rule_hash],
    })
def loaddistributor(filepath, aim_dir):
    """Start one ``prep_func`` worker thread per file found under a directory.

    The directory tree is walked recursively with ``os.walk``; a message is
    printed for every thread that is started. The threads are not joined.

    Args:
        filepath: Root directory to walk.
        aim_dir: Path of the CSV file handed to every worker as its output.
    """
    x = 0
    for root, _dirs, files in os.walk(filepath):
        for file_name in files:
            # prep_func expects the path of a single file; os.walk only
            # yields bare names per directory, so join them with the root.
            worker = threading.Thread(
                target=prep_func,
                args=[os.path.join(root, file_name), aim_dir],
            )
            worker.start()
            print('thread ' + str(x) + ' started')
            x += 1
    # TODO: limit the number of concurrently running threads?
# Sample grammar paths; not used by the call below. Every backslash is
# escaped explicitly: sequences such as "\A" or "\M" are invalid escapes that
# newer Python versions warn about.
example_file = 'C:\\Users\\Admin\\Documents\\Studium\\MSR\\Versioning\\Chris2011DotNetCore-for-NetBeans\\CSharpParser-1502210733.g4'
example_file_2 = 'C:\\Users\\Admin\\Documents\\Studium\\MSR\\Versioning\\Blazebitblaze-persistence\\JPQLSelectExpressionParser-1555255937.g4'
# Process every file below the Versioning directory into test.csv.
loaddistributor('C:\\Users\\Admin\\Documents\\Studium\\MSR\\Versioning\\', 'C:\\Users\\Admin\\Documents\\Studium\\MSR\\test.csv')
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
GitPython was originally written by Michael Trier.
GitPython 0.2 was partially (re)written by Sebastian Thiel, based on 0.1.6 and git-dulwich.
Contributors are:
-Michael Trier <mtrier _at_ gmail.com>
-Alan Briolat
-Florian Apolloner <florian _at_ apolloner.eu>
-David Aguilar <davvid _at_ gmail.com>
-Jelmer Vernooij <jelmer _at_ samba.org>
-Steve Frécinaux <code _at_ istique.net>
-Kai Lautaportti <kai _at_ lautaportti.fi>
-Paul Sowden <paul _at_ idontsmoke.co.uk>
-Sebastian Thiel <byronimo _at_ gmail.com>
-Jonathan Chu <jonathan.chu _at_ me.com>
-Vincent Driessen <me _at_ nvie.com>
-Phil Elson <pelson _dot_ pub _at_ gmail.com>
-Bernard `Guyzmo` Pratz <guyzmo+gitpython+pub@m0g.net>
-Timothy B. Hartman <tbhartman _at_ gmail.com>
-Konstantin Popov <konstantin.popov.89 _at_ yandex.ru>
-Peter Jones <pjones _at_ redhat.com>
-Anson Mansfield <anson.mansfield _at_ gmail.com>
-Ken Odegard <ken.odegard _at_ gmail.com>
-Alexis Horgix Chotard
-Piotr Babij <piotr.babij _at_ gmail.com>
-Mikuláš Poul <mikulaspoul _at_ gmail.com>