2021
Nadeem, Anas; Sarwar, Muhammad Usman; Malik, Muhammad Zubair
Automatic Issue Classifier: A Transfer Learning Framework for Classifying Issue Reports Inproceedings
In: 2021 IEEE International Symposium on Software Reliability Engineering Workshops (ISSREW), pp. 421-426, 2021.
@inproceedings{9700173,
title = {Automatic Issue Classifier: A Transfer Learning Framework for Classifying Issue Reports},
author = {Anas Nadeem and Muhammad Usman Sarwar and Muhammad Zubair Malik},
doi = {10.1109/ISSREW53611.2021.00113},
year = {2021},
date = {2021-10-01},
urldate = {2021-10-01},
booktitle = {2021 IEEE International Symposium on Software Reliability Engineering Workshops (ISSREW)},
pages = {421-426},
abstract = {Issue tracking systems are used in the software industry to facilitate maintenance activities that keep the software robust and up to date with ever-changing industry requirements. Usually, users report issues that can be categorized into different labels such as bug reports, enhancement requests, and questions related to the software. Most issue tracking systems make the labeling of these issue reports optional for the issue submitter, which leads to a large number of unlabeled issue reports. In this paper, we present a state-of-the-art method to classify the issue reports into their respective categories, i.e., bug, enhancement, and question. This is a challenging task because of the common use of informal language in the issue reports. Existing studies use traditional natural language processing approaches adopting keyword-based features, which fail to incorporate the contextual relationship between words and therefore result in a high rate of false positives and false negatives. Moreover, previous works utilize a uni-label approach to classify the issue reports; however, in reality, an issue submitter can tag one issue report with more than one label at a time. This paper presents our approach to classify the issue reports in a multi-label setting. We use an off-the-shelf neural network called RoBERTa and fine-tune it to classify the issue reports. We validate our approach on issue reports belonging to numerous industrial projects from GitHub. We were able to achieve promising F1 scores of 81%, 74%, and 80% for bug reports, enhancements, and questions, respectively. We also develop an industry tool called Automatic Issue Classifier (AIC), which automatically assigns labels to newly reported issues on GitHub repositories with high accuracy.},
keywords = {BERT, classification, github, github apps, issue classification, Software engineering},
pubstate = {published},
tppubtype = {inproceedings}
}
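The fine-tuning recipe described in the entry above can be sketched with the Hugging Face transformers library. The snippet below is a minimal illustration, not the authors' implementation: it configures RoBERTa for multi-label classification over the bug / enhancement / question labels and runs one training step; the encode helper and the two example issues are hypothetical stand-ins for mined GitHub data.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

LABELS = ["bug", "enhancement", "question"]  # label set used in the paper

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(LABELS),
    problem_type="multi_label_classification",  # BCE loss, one sigmoid per label
)

def encode(texts, label_sets):
    """Tokenize issue texts and build multi-hot targets (an issue may carry several labels)."""
    enc = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")
    targets = torch.zeros(len(texts), len(LABELS))
    for i, labels in enumerate(label_sets):
        for name in labels:
            targets[i, LABELS.index(name)] = 1.0
    enc["labels"] = targets
    return enc

# Toy batch standing in for real issue reports (hypothetical examples).
batch = encode(
    ["App crashes when opening settings", "Please add dark mode. Is it planned?"],
    [["bug"], ["enhancement", "question"]],
)
out = model(**batch)               # loss is BCEWithLogitsLoss over the three labels
out.loss.backward()                # an optimizer step would follow during fine-tuning
probs = torch.sigmoid(out.logits)  # per-label probabilities at inference time

At inference, each label whose probability exceeds a chosen threshold (e.g. 0.5) is assigned, which is what allows one issue to receive more than one label at a time.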
2020
Sarwar, Muhammad Usman; Zafar, Sarim; Mkaouer, Mohamed Wiem; Walia, Gursimran Singh; Malik, Muhammad Zubair
Multi-label Classification of Commit Messages using Transfer Learning Inproceedings
In: 2020 IEEE International Symposium on Software Reliability Engineering Workshops (ISSREW), pp. 37-42, 2020.
@inproceedings{9307651,
title = {Multi-label Classification of Commit Messages using Transfer Learning},
author = {Muhammad Usman Sarwar and Sarim Zafar and Mohamed Wiem Mkaouer and Gursimran Singh Walia and Muhammad Zubair Malik},
doi = {10.1109/ISSREW51248.2020.00034},
year = {2020},
date = {2020-10-01},
urldate = {2020-10-01},
booktitle = {2020 IEEE International Symposium on Software Reliability Engineering Workshops (ISSREW)},
pages = {37-42},
abstract = {Commit messages are used in the industry by developers to annotate changes made to the code. Accurate classification of these messages can help monitor the software evolution process and enable better tracking for various industrial stakeholders. In this paper, we present a state-of-the-art method for commit message classification into categories as per Swanson's maintenance activities, i.e., “Corrective”, “Perfective”, and “Adaptive”. This is a challenging task because not all commit messages are well written and informative. Existing approaches rely on keyword-based techniques to solve this problem. However, these approaches are oblivious to the full language model and do not recognize the contextual relationship between words. The state-of-the-art methodology in Natural Language Processing (NLP) is to train a context-aware neural network (Transformer) on a very large data set that encompasses the entire language and then fine-tune it for a specific task. In this way, the model can learn the language, pay attention to the context, and then transfer that knowledge for better performance at the specific task. We use an off-the-shelf neural network called DistilBERT and fine-tune it for the commit message classification task. This step is non-trivial because programming languages and commit messages have unique keywords, jargon, and idioms. This paper presents our effort in training this model and constructing the data set for this task. We describe the rules used to construct the data set. We validate our approach on industrial projects from GitHub, such as Kubernetes, Linux, TensorFlow, Spark, TypeScript, and PyTorch. We were able to achieve an 87% F1-score for the commit message classification task, which is an order of magnitude more accurate than previous studies.},
keywords = {BERT, classification, github, transfer learning},
pubstate = {published},
tppubtype = {inproceedings}
}
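A transfer-learning setup of the kind described in the entry above can be outlined with the Hugging Face Trainer API. This is a minimal sketch, not the authors' code: the three commit messages, the dataset wrapper, and the training hyperparameters are assumptions chosen only to make the example self-contained.

import torch
from torch.utils.data import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)

CLASSES = ["corrective", "perfective", "adaptive"]  # Swanson's maintenance categories

class CommitDataset(Dataset):
    """Wraps tokenized commit messages and integer class ids for the Trainer."""
    def __init__(self, messages, labels, tokenizer):
        self.enc = tokenizer(messages, truncation=True, padding=True)
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, i):
        item = {k: torch.tensor(v[i]) for k, v in self.enc.items()}
        item["labels"] = torch.tensor(self.labels[i])
        return item

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(CLASSES))

# Hypothetical commit messages standing in for the mined GitHub data.
train = CommitDataset(
    ["fix null pointer dereference in scheduler",   # corrective
     "refactor config parser for readability",      # perfective
     "support the new v2 manifest format"],         # adaptive
    [0, 1, 2],
    tokenizer)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="commit-clf", num_train_epochs=3,
                           per_device_train_batch_size=8),
    train_dataset=train)
trainer.train()  # fine-tunes DistilBERT with cross-entropy over the three classes

Unlike the multi-label issue classifier above, this is a single-label, multi-class problem, so the model's default cross-entropy loss over integer class ids is used.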
Zafar, Sarim; Sarwar, Muhammad Usman; Salem, Saeed; Malik, Muhammad Zubair
Language and Obfuscation Oblivious Source Code Authorship Attribution Journal Article
In: IEEE Access, vol. 8, pp. 197581-197596, 2020, ISSN: 2169-3536.
@article{9245552,
title = {Language and Obfuscation Oblivious Source Code Authorship Attribution},
author = {Sarim Zafar and Muhammad Usman Sarwar and Saeed Salem and Muhammad Zubair Malik},
doi = {10.1109/ACCESS.2020.3034932},
issn = {2169-3536},
year = {2020},
date = {2020-01-01},
urldate = {2020-01-01},
journal = {IEEE Access},
volume = {8},
pages = {197581-197596},
abstract = {Source Code Authorship Attribution can answer many interesting questions such as: Who wrote the malicious source code? Is the source code plagiarized, and does it infringe on copyright? Source Code Authorship Attribution is done by observing distinctive patterns of style in a source code whose author is unknown and comparing them with patterns learned from known authors' source codes. In this paper, we present an efficient approach to learn a novel representation using deep metric learning. The existing state-of-the-art approaches tokenize the source code and work on the keyword level, limiting the elements of style they can consider. Our approach uses the raw character stream of source code. It can examine keywords and different stylistic features such as variable naming conventions or using tabs vs. spaces, enabling us to learn a richer representation than other keyword-based approaches. Our approach uses a character-level Convolutional Neural Network (CNN). We train the CNN to map the input character stream to a dense vector such that source codes authored by the same author are mapped close to each other, while source codes written by different programmers are mapped farther apart in the embedding space. We then feed these source code vectors into a K-nearest neighbor (KNN) classifier that uses Manhattan distance to perform authorship attribution. We validated our approach on the Google Code Jam (GCJ) dataset across three different programming languages. We prepare our large-scale dataset in such a way that it does not induce type-I error. Our approach is more scalable and efficient than existing methods. We were able to achieve an accuracy of 84.94% across 20,458 authors, which is more than twice the scale of any previous study under a much more challenging setting.},
keywords = {artificial neural networks, natural language processing, Software engineering},
pubstate = {published},
tppubtype = {article}
}
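The pipeline described in the entry above (character-level CNN encoder, deep metric learning, Manhattan-distance KNN) can be outlined as below. This is a sketch under assumed hyperparameters (ASCII vocabulary, a single convolution block, 64-dimensional embeddings, triplet margin loss); the paper's actual architecture and loss formulation may differ.

import torch
import torch.nn as nn
from sklearn.neighbors import KNeighborsClassifier

VOCAB = 128      # raw character stream, assuming ASCII code points
EMBED_DIM = 64   # size of the learned source-code embedding (assumption)

class CharCNNEncoder(nn.Module):
    """Maps a raw character stream to a dense vector; the architecture is illustrative."""
    def __init__(self):
        super().__init__()
        self.char_embed = nn.Embedding(VOCAB, 32)
        self.conv = nn.Sequential(
            nn.Conv1d(32, 128, kernel_size=7), nn.ReLU(),
            nn.AdaptiveMaxPool1d(1))           # pool over the whole file
        self.proj = nn.Linear(128, EMBED_DIM)
    def forward(self, char_ids):               # char_ids: (batch, seq_len)
        x = self.char_embed(char_ids).transpose(1, 2)   # -> (batch, 32, seq_len)
        x = self.conv(x).squeeze(-1)
        return nn.functional.normalize(self.proj(x), dim=-1)

encoder = CharCNNEncoder()
metric_loss = nn.TripletMarginLoss(margin=0.2)  # pulls same-author files together

def encode_source(code: str) -> torch.Tensor:
    ids = torch.tensor([[min(ord(c), VOCAB - 1) for c in code]])
    return encoder(ids)

# One metric-learning step on an (anchor, positive, negative) triple of files.
anchor   = encode_source("int main(){return 0;}")           # author A
positive = encode_source("int main(){\n\treturn 0;\n}")     # author A again
negative = encode_source("def main():\n    return 0")       # a different author
metric_loss(anchor, positive, negative).backward()

# After training, attribute authorship with Manhattan-distance KNN on the embeddings.
knn = KNeighborsClassifier(n_neighbors=1, metric="manhattan")
knn.fit(torch.cat([anchor, positive, negative]).detach().numpy(), ["A", "A", "B"])
print(knn.predict(encode_source("int main(){ return 1; }").detach().numpy()))

The triplet loss stands in here for "deep metric learning" in general; whatever loss is used, the end product is the same: an embedding space in which nearest-neighbor search with Manhattan distance identifies the author.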
2019
Zafar, Sarim; Malik, Muhammad Zubair; Walia, Gursimran Singh
Towards Standardizing and Improving Classification of Bug-Fix Commits Inproceedings
In: 2019 ACM/IEEE International Symposium on Empirical Software Engineering and Measurement (ESEM), pp. 1-6, 2019, ISSN: 1949-3789.
@inproceedings{8870174,
title = {Towards Standardizing and Improving Classification of Bug-Fix Commits},
author = {Sarim Zafar and Muhammad Zubair Malik and Gursimran Singh Walia},
doi = {10.1109/ESEM.2019.8870174},
issn = {1949-3789},
year = {2019},
date = {2019-09-01},
urldate = {2019-09-01},
booktitle = {2019 ACM/IEEE International Symposium on Empirical Software Engineering and Measurement (ESEM)},
pages = {1-6},
abstract = {Background: Open source software repositories like GitHub are mined to gain useful empirical software engineering insights and answer critical research questions. However, the present state-of-the-art mining approaches suffer from a high error rate in the labeling of data that is used for such analysis. This is particularly true when labels are automatically generated from the commit message, and seriously undermines the results of these studies. Aim: Our goal is to automatically label commit comments with high accuracy. In this work, we focus on classifying a commit as a “Bug-Fix commit” or not. Method: Traditionally, researchers have utilized keyword-based approaches to identify bug-fix commits, which leads to a significant increase in the error rate. We present an alternative methodology leveraging a deep neural network model called Bidirectional Encoder Representations from Transformers (BERT) that can understand the context of the commit message. We provide the rules for semantic interpretation of commit comments. We construct a hand-labeled dataset from real GitHub commits according to these rules and fine-tune BERT for classification. Results: Our initial evaluation shows that our approach significantly reduces the error rate, with up to 10% relative improvement in classification over keyword-based approaches. Future Direction: We plan on extending our dataset to cover more corner cases and reduce programming-language-specific biases. We also plan on refining the semantic rules. In this work, we have only considered a simple binary classification problem (Bug-Fix or not), which we plan to extend to additional classes and a multiclass setting. Conclusion: The rules, data, and the model proposed in this paper have the potential to be used by people analyzing open source repositories to improve the labeling of data used in their analysis.},
keywords = {classification, github},
pubstate = {published},
tppubtype = {inproceedings}
}
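To make the contrast in the entry above concrete, the sketch below places a keyword heuristic next to a BERT-based binary classifier. It is illustrative only: the regular expression is an assumed baseline of the kind the paper argues against, and bert-base-uncased is loaded as a stand-in; in practice the weights fine-tuned on the hand-labeled commit dataset would be loaded instead of the base model.

import re
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Keyword baseline: context-blind, so "Fix typo in README" counts as a bug fix.
BUG_WORDS = re.compile(r"\b(fix|bug|defect|patch|fault)\b", re.IGNORECASE)

def keyword_is_bugfix(message: str) -> bool:
    return bool(BUG_WORDS.search(message))

# Contextual alternative: BERT with a two-class head. The base checkpoint is a
# placeholder; load the fine-tuned weights here to get meaningful predictions.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2)

def bert_is_bugfix(message: str) -> bool:
    inputs = tokenizer(message, truncation=True, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    return logits.argmax(dim=-1).item() == 1   # convention: class 1 = bug-fix

msg = "Fix typo in README"  # keyword rule says bug-fix; a context-aware model need not
print(keyword_is_bugfix(msg), bert_is_bugfix(msg))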