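# 728x90  (stray text, presumably left over from the web page this code was copied from; not part of the program)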
import re
import pandas as pd
def reformat_braces(lines):
    """Move braces that are glued to other code onto lines of their own.

    Every input line is stripped of surrounding whitespace and then:

    * a leading ``}`` that is followed by more text is emitted as its own
      line, and the remaining text is processed further;
    * a trailing ``{`` that is preceded by more text is emitted as its own
      line after that text.

    Both rules are applied to the same line, so ``} else {`` yields the
    three lines ``}``, ``else`` and ``{``.  Braces in the middle of a line
    and a trailing ``}`` (as in ``foo(); }``) are left where they are.

    Args:
        lines: Iterable of source-code lines (strings).

    Returns:
        A new list of stripped lines with the braces separated.
    """
    reformatted_lines = []
    for line in lines:
        text = line.strip()

        # Split off a leading "}" first so that the remainder can still be
        # checked for a trailing "{" (e.g. "} catch (E e) {").
        if text.startswith("}") and len(text) > 1:
            reformatted_lines.append("}")
            text = text[1:].strip()

        if text.endswith("{") and len(text) > 1:
            reformatted_lines.append(text[:-1].strip())
            reformatted_lines.append("{")
        else:
            reformatted_lines.append(text)
    return reformatted_lines
def auto_indent_java_code(lines: list) -> list:
    """Re-indent brace-delimited source lines according to brace nesting.

    Each line is stripped and prefixed with four spaces per nesting level.
    The level goes up after a line that ends with ``{`` and goes down
    before a line that closes a block.  A line closes a block when it
    starts with ``}``, or when it ends with ``}``, contains more ``}``
    than ``{`` and does not start with ``else``, ``catch`` or ``finally``.
    Balanced one-liners such as ``void f() {}`` therefore keep the current
    level.  The level never drops below zero, so an unmatched ``}`` cannot
    skew the indentation of the lines that follow it.

    Args:
        lines: Iterable of source-code lines (strings).

    Returns:
        A list with one re-indented line per input line; blank input lines
        become empty strings.
    """
    indent_unit = "    "  # 4 spaces per nesting level
    continuation_keywords = ("else", "catch", "finally")
    depth = 0
    formatted_lines = []

    for line in lines:
        text = line.strip()

        # Blank lines are kept, but without any indentation.
        if not text:
            formatted_lines.append("")
            continue

        closes_block = text.startswith("}") or (
            text.endswith("}")
            and text.count("}") > text.count("{")
            and not text.startswith(continuation_keywords)
        )
        if closes_block:
            # Dedent before emitting so the closing brace lines up with the
            # statement that opened the block; clamp at the left margin.
            depth = max(depth - 1, 0)

        formatted_lines.append(indent_unit * depth + text)

        # An opening brace at the end of the line indents what follows.
        if text.endswith("{"):
            depth += 1

    return formatted_lines
# def find_string_usage(lines, target_string):
# result = []
# # 1. Check for the String type usage (variable assignment)
# string_pattern = re.compile(r'\b([a-zA-Z_][a-zAZ0-9_]*)\s*=\s*"([^"]*)"')
# for idx, line in enumerate(lines, 1):
# match = string_pattern.search(line)
# if match and match.group(2) == target_string:
# result.append({
# "Type": "String",
# "Name": match.group(1),
# "Value": match.group(2),
# "Line": idx
# })
# # 2. Check for Function type usage (where the string is passed as a value)
# function_pattern = re.compile(r'\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(([^)]*)\)\s*{')
# # Extended control flow keywords to exclude (includes common flow control statements)
# control_flow_keywords = {
# 'for', 'if', 'while', 'switch', 'try', 'catch', 'finally', 'do'
# }
# for idx, line in enumerate(lines, 1):
# match = function_pattern.search(line)
# if match:
# function_name = match.group(1) # Function name is in the first capture group
# # Skip if the function is a control flow statement (e.g., 'for', 'if', etc.)
# if function_name in control_flow_keywords:
# continue
# arguments = match.group(2) # Function arguments are in the second capture group
# if target_string in arguments:
# result.append({
# "Type": "Function",
# "Name": function_name,
# "Value": target_string,
# "Line": idx
# })
# return result
def reformat_java_braces(lines: list) -> str:
    """Join stand-alone opening braces onto the preceding line.

    A line that consists solely of ``{`` (ignoring surrounding whitespace)
    is removed and `` {`` is appended to the previously emitted line, after
    that line's trailing whitespace has been dropped.  If there is no
    previous line, a bare ``{`` is emitted instead.  Every other line,
    closing braces included, is passed through unchanged.

    Args:
        lines: Source-code lines (strings).

    Returns:
        The resulting lines joined with newline characters.
    """
    merged = []
    for raw in lines:
        if raw.strip() != "{":
            # Ordinary code and closing braces keep their original text.
            merged.append(raw)
        elif merged:
            merged[-1] = f"{merged[-1].rstrip()} {{"
        else:
            # Nothing to attach to: keep the brace as the first line.
            merged.append("{")
    return "\n".join(merged)
# def find_string_usage(lines, target_string):
# result = []
# # 1. Check for the String type usage (variable assignment)
# string_pattern = re.compile(r'\b([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"([^"]*)"')
# for idx, line in enumerate(lines, 1):
# match = string_pattern.search(line)
# if match and match.group(2) == target_string:
# print(line)
# result.append({
# "Type": "String",
# "Name": match.group(1),
# "Value": match.group(2),
# "Line": idx
# })
# # 2. Check for Function type usage (where the string is passed as a value)
# function_pattern = re.compile(r'\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(([^)]*)\)\s*{')
# # Extended control flow keywords to exclude (includes common flow control statements)
# control_flow_keywords = {
# 'for', 'if', 'while', 'switch', 'try', 'catch', 'finally', 'do', 'else if', 'else'
# }
# for idx, line in enumerate(lines, 1):
# match = function_pattern.search(line)
# if match:
# print(line)
# function_name = match.group(1) # Function name is in the first capture group
# # Skip if the function is a control flow statement (e.g., 'for', 'if', etc.)
# if function_name in control_flow_keywords:
# continue
# arguments = match.group(2) # Function arguments are in the second capture group
# if target_string in arguments:
# result.append({
# "Type": "Function",
# "Name": function_name,
# "Value": target_string,
# "Line": idx
# })
# return result
def find_functions_and_objects(lines, target_string):
    """Locate string literals containing *target_string* and their usages.

    The scan runs in two passes over *lines*:

    1. Every ``String <name> = "...<target_string>...";`` declaration is
       recorded under the variable's name, and every
       ``<obj>.<method>("...<target_string>...")`` call is recorded under
       *target_string* itself.  The recorded names become the set of
       tracked names.
    2. Inside each function body, every line that contains a tracked name
       (plain substring test) is recorded against the enclosing function.
       A function starts at a line matching ``name(...) {`` whose name is
       not a control-flow keyword, and ends when its braces balance out.
       The function-header line itself is not scanned.

    Args:
        lines: Sequence of source lines (strings); line numbers in the
            result are 1-based positions in this sequence.
        target_string: Text to look for inside string literals.  It is
            regex-escaped before being used in a pattern.

    Returns:
        A list of dicts with the keys ``"Type"``, ``"Name"``, ``"Value"``
        and ``"Line"``.  All ``"String"`` entries from the first pass come
        first, followed by the ``"Function"`` entries from the second pass.
        For a ``"Function"`` entry, ``"Value"`` is *target_string* when the
        line is also one where a literal was found in pass 1, otherwise the
        tracked name that was found on the line.
    """
    escaped_target = re.escape(target_string)
    string_pattern = re.compile(
        r'String\s+(\w+)\s*=\s*"([^"]*' + escaped_target + r'[^"]*)";|(\w+\.\w+\s*\(\s*"[^"]*\b' + escaped_target + r'\b[^"]*"\s*\))'
    )
    function_pattern = re.compile(r'\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(([^)]*)\)\s*{')
    control_flow_keywords = {'for', 'if', 'while', 'switch', 'try', 'catch', 'finally', 'do', 'else if', 'else'}

    result = []
    target_objects = set()  # unique names that stand for the target string

    # Pass 1: collect the literals and the variables they are assigned to.
    for idx, line in enumerate(lines, 1):
        for match in string_pattern.finditer(line):
            variable_name = match.group(1)
            if variable_name is None:
                # The method-call alternative matched: there is no variable,
                # so the literal is tracked under the target string itself.
                target_objects.add(target_string)
                result.append({
                    "Type": "String",
                    "Name": target_string,
                    "Value": target_string,
                    "Line": idx
                })
            else:
                target_objects.add(variable_name)
                result.append({
                    "Type": "String",
                    "Name": variable_name,
                    "Value": match.group(2),
                    "Line": idx
                })

    if not target_objects:
        return result

    # Line numbers found in pass 1, as a set for O(1) membership tests.
    declaration_lines = {entry["Line"] for entry in result}
    # Sorted so the order of the output does not depend on set iteration
    # order (which varies between runs for strings).
    tracked_names = sorted(target_objects)

    # Pass 2: attribute usages of the tracked names to enclosing functions.
    current_function = None
    brace_depth = 0
    for idx, line in enumerate(lines, 1):
        match = function_pattern.search(line)
        if match and match.group(1) not in control_flow_keywords:
            current_function = match.group(1)
            # Count braces from the "{" that ends the header so that a
            # one-line body such as "void f() { }" is closed immediately.
            body_start = line[match.end() - 1:]
            brace_depth = body_start.count("{") - body_start.count("}")
            if brace_depth <= 0:
                current_function = None
            continue

        if current_function is None:
            continue

        for obj in tracked_names:
            if obj in line:
                result.append({
                    "Type": "Function",
                    "Name": current_function,
                    "Value": target_string if idx in declaration_lines else obj,
                    "Line": idx
                })

        # Nested blocks (if/for/try ...) raise and lower the depth; the
        # function ends only when its own closing brace is reached.
        brace_depth += line.count("{") - line.count("}")
        if brace_depth <= 0:
            current_function = None

    return result
def _strip_java_comments(source):
    """Return *source* with ``//`` and ``/* ... */`` comments removed.

    String and character literals are matched first and copied through
    unchanged, so comment markers inside them (for example the ``//`` in a
    JDBC URL) are not mistaken for the start of a comment.
    """
    token_pattern = re.compile(
        r'"(?:\\.|[^"\\\n])*"'     # string literal
        r"|'(?:\\.|[^'\\\n])*'"    # character literal
        r"|//[^\n]*"               # line comment
        r"|/\*.*?\*/",             # block comment (also covers /** ... */)
        flags=re.S,
    )

    def keep_literals(match):
        token = match.group(0)
        return token if token[0] in "\"'" else ""

    return token_pattern.sub(keep_literals, source)


def extract_string_and_function_details(file_path, search_string):
    """Normalise a Java source file and report where *search_string* is used.

    The file is read as UTF-8, comments and blank lines are removed, and
    the brace layout is normalised in three stages.  The output of each
    stage is written next to the input file for inspection:

    * ``<base>_1.java`` - result of ``reformat_braces``
    * ``<base>_2.java`` - result of ``auto_indent_java_code``
    * ``<base>_3.java`` - result of ``reformat_java_braces``

    ``<base>`` is *file_path* without a trailing ``.java`` suffix.  The
    stage-3 text is then scanned with ``find_functions_and_objects``.

    Args:
        file_path: Path of the Java source file to analyse.
        search_string: Text to look for inside string literals.

    Returns:
        Whatever ``find_functions_and_objects`` returns for the normalised
        source and *search_string*.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        source = file.read()

    # Remove comments, then drop the lines that are left empty.
    content_no_comments = _strip_java_comments(source)
    non_empty_lines = [line for line in content_no_comments.splitlines() if line.strip()]

    # Only strip a real ".java" suffix; splitting on the first ".java"
    # would truncate paths that contain it in a directory name.
    suffix = ".java"
    base_path = file_path[:-len(suffix)] if file_path.endswith(suffix) else file_path

    # Stage 1: braces on their own lines.
    reformatted_lines = reformat_braces(non_empty_lines)
    with open(base_path + "_1" + ".java", 'w', encoding='utf-8') as file:
        file.write('\n'.join(reformatted_lines))

    # Stage 2: indentation derived from brace nesting.
    formatted_lines = auto_indent_java_code(reformatted_lines)
    with open(base_path + "_2" + ".java", 'w', encoding='utf-8') as file:
        file.write('\n'.join(formatted_lines))

    # Stage 3: opening braces joined back onto the preceding line.
    last_lines = reformat_java_braces(formatted_lines)
    with open(base_path + "_3" + ".java", 'w', encoding='utf-8') as file:
        file.write(last_lines)

    return find_functions_and_objects(last_lines.split('\n'), search_string)
# Execution section (실행 영역)
# Example usage: analyse one Java file and list every place the search
# string is defined or used.
search_string = "TEST_DB"  # string value to search for
main_path = "C:/Users/l4rea/Documents/codes/python/project/05_pmutil/java_db/"
file_path = "KNNClassifier.java"  # Java file to analyse, appended to main_path

occurrences = extract_string_and_function_details(f"{main_path}{file_path}", search_string)

# Print the results, one record per line.
print("Occurrences of search_string:")
for occurrence in occurrences:
    print(occurrence)