Title: | R Source Code Similarity Evaluation by Variable/Function Names |
---|---|
Description: | Evaluates R source codes by variable and/or function names. Similar source codes should deliver similarity coefficients near one. Since neither the frequency nor the order of the used names is considered, a manual inspection of the R source code is required to check for similarity. Possible use cases include detection of code clones for improving software quality and of plagiarism amongst students' assignments. |
Authors: | Sigbert Klinke [aut, cre] |
Maintainer: | Sigbert Klinke <[email protected]> |
License: | GPL-3 |
Version: | 0.2.1 |
Built: | 2024-11-14 03:13:36 UTC |
Source: | https://github.com/sigbertklinke/rscc |
Converts a data frame of similarity coefficients into a graph.
as_igraph(x, tol = 100 * .Machine$double.eps, tol1 = 8 * tol, ...)
as_igraph(x, tol = 100 * .Machine$double.eps, tol1 = 8 * tol, ...)
x |
a similarity object |
tol |
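numeric scalar >= 0. Smaller differences are not
considered, see all.equal.numeric |
tol1 |
numeric scalar >= 0. Tolerance used by isSymmetric.matrix() to pre-test the first and last few rows for fast detection of obviously asymmetric cases. |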
... |
further parameters used by igraph::graph_from_adjacency_matrix |
an igraph object
files <- list.files(path=system.file("examples", package="rscc"), pattern="*.R$", full.names = TRUE) prgs <- sourcecode(files, title=basename(files)) docs <- documents(prgs) simm <- similarities(docs) # a similarity coefficients equal to zero does not create an edge! g <- as_igraph(simm, diag=FALSE) # thicker edges have higher similarity coefficients plot(g, edge.width=1+3*igraph::E(g)$weight)
files <- list.files(path=system.file("examples", package="rscc"), pattern="*.R$", full.names = TRUE) prgs <- sourcecode(files, title=basename(files)) docs <- documents(prgs) simm <- similarities(docs) # a similarity coefficients equal to zero does not create an edge! g <- as_igraph(simm, diag=FALSE) # thicker edges have higher similarity coefficients plot(g, edge.width=1+3*igraph::E(g)$weight)
Creates a temporary HTML file with source codes and opens it in a browser using browseURL.
Note that the source code is reformatted.
browse(prgs, simdf, n = (simdf[, 3] > 0), width.cutoff = 60, css = NULL)
browse(prgs, simdf, n = (simdf[, 3] > 0), width.cutoff = 60, css = NULL)
prgs |
sourcecode object |
simdf |
similarity object |
n |
integer: comparisons to show (default: simdf[, 3] > 0, i.e. all comparisons with a similarity coefficient greater than zero) |
width.cutoff |
integer: an integer in [20, 500]: if a line's character length is at or over this number, the function will try to break it into a new line (default: 60) |
css |
character: file name of CSS style for highlighting the R code |
invisibly the name of the temporary HTML file
# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files) simm <- similarities(documents(prgs)) simdf <- matrix2dataframe(simm) if (interactive()) browse(prgs, simdf)
# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files) simm <- similarities(documents(prgs)) simdf <- matrix2dataframe(simm) if (interactive()) browse(prgs, simdf)
Creates word vectors from parsed source code objects. If type=="vars" then the names of all.vars(.), if type=="funs" then the names of setdiff(all.names(.), all.vars(.)), and if type=="names" then the names of all.names(.) are used.
documents( prgs, type = c("vars", "funs", "names"), ignore.case = TRUE, minlen = 2, ... )
documents( prgs, type = c("vars", "funs", "names"), ignore.case = TRUE, minlen = 2, ... )
prgs |
prgs sourcecode object |
type |
character: either "vars", "funs" or "names" (default: "vars") |
ignore.case |
logical: If TRUE, case is ignored for computing (default: TRUE) |
minlen |
integer: minimal name length to be considered (default: 2) |
... |
unused |
a documents object
# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files, basename=TRUE) docs <- documents(prgs) docs
# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files, basename=TRUE) docs <- documents(prgs) docs
Computes a frequency table of words and documents.
freq_table(docs, ...)
freq_table(docs, ...)
docs |
documents object |
... |
unused |
a matrix with similarities
# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files, basename=TRUE) docs <- documents(prgs) freq_table (docs)
# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files, basename=TRUE) docs <- documents(prgs) freq_table (docs)
Converts a numeric matrix to a data frame with decreasing or increasing values: First column row index, second column col index and third column the value. If the matrix is symmetric, only the upper triangle is taken into account.
matrix2dataframe( m, decreasing = TRUE, tol = 100 * .Machine$double.eps, tol1 = 8 * tol, ... )
matrix2dataframe( m, decreasing = TRUE, tol = 100 * .Machine$double.eps, tol1 = 8 * tol, ... )
m |
numeric: a matrix of values |
decreasing |
logical: should the sort order be increasing or decreasing (default: TRUE, i.e. decreasing) |
tol |
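numeric scalar >= 0. Smaller differences are not
considered, see all.equal.numeric |
tol1 |
numeric scalar >= 0. Tolerance used by isSymmetric.matrix() to pre-test the first and last few rows for fast detection of obviously asymmetric cases. |
... |
further arguments passed to methods; the matrix method
passes these to all.equal |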
a data frame with an attribute matrix that contains m
# non-symmetric x <- matrix(runif(9), ncol=3) matrix2dataframe(x)
# non-symmetric x <- matrix(runif(9), ncol=3) matrix2dataframe(x)
same_file
same_file(m, replacement = 0)
same_file(m, replacement = 0)
m |
matrix object with row and column names |
replacement |
value for replacement (default: 0) |
matrix
m <- matrix(runif(25), ncol=5) colnames(m) <- rownames(m) <- c(sprintf("m[%.f]", 1:3), sprintf("m2[%.f]", 1:2)) m same_file(m)
m <- matrix(runif(25), ncol=5) colnames(m) <- rownames(m) <- c(sprintf("m[%.f]", 1:3), sprintf("m2[%.f]", 1:2)) m same_file(m)
Internal function for faster computation. No checks on input will be performed.
sim_coeff(set1, set2, setfull, coeff)
sim_coeff(set1, set2, setfull, coeff)
set1 |
character: unique vector of words |
set2 |
character: unique vector of words |
setfull |
character: unique vector of texts to compare |
coeff |
character: name of similarity coefficient to use |
value of similarity coefficient
Computes a similarity coefficient based on the unique elements of set1 and set2 in relation to setfull. If setfull is NULL then setfull is set to unique(c(set1, set2)). For more details, see the vignette vignette("rscc").
similarity_coeff( set1, set2, setfull = NULL, coeff = c("jaccard", "braun", "dice", "hamann", "kappa", "kulczynski", "ochiai", "phi", "russelrao", "matching", "simpson", "sneath", "tanimoto", "yule") )
similarity_coeff( set1, set2, setfull = NULL, coeff = c("jaccard", "braun", "dice", "hamann", "kappa", "kulczynski", "ochiai", "phi", "russelrao", "matching", "simpson", "sneath", "tanimoto", "yule") )
set1 |
vector: elements to compare |
set2 |
vector: elements to compare |
setfull |
vector: elements to compare (default: NULL) |
coeff |
character: coefficient to compute (default: "jaccard") |
a numeric similarity coefficient
s1 <- 1:3 s2 <- 1:5 similarity_coeff(s1, s2) s1 <- letters[1:3] s2 <- LETTERS[1:5] similarity_coeff(s1, s2)
s1 <- 1:3 s2 <- 1:5 similarity_coeff(s1, s2) s1 <- letters[1:3] s2 <- LETTERS[1:5] similarity_coeff(s1, s2)
sims and similarities both calculate, for each pair of source code objects, the similarity coefficients and return a data frame with the coefficients in descending order.
A larger coefficient means a greater similarity.
sims(...) similarities( docs, all = FALSE, coeff = c("jaccard", "braun", "dice", "hamann", "kappa", "kulczynski", "ochiai", "phi", "russelrao", "matching", "simpson", "sneath", "tanimoto", "yule") )
sims(...) similarities( docs, all = FALSE, coeff = c("jaccard", "braun", "dice", "hamann", "kappa", "kulczynski", "ochiai", "phi", "russelrao", "matching", "simpson", "sneath", "tanimoto", "yule") )
... |
all parameters in sims are passed on to similarities |
docs |
document object |
all |
logical: should the similarity coefficients be computed based on all sourcecode objects or just the two considered (default: FALSE) |
coeff |
character: coefficient to compute (default: "jaccard") |
a data frame with the results
# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files, basename=TRUE) docs <- documents(prgs) similarities(docs) # further steps # m <- similarities(docs) # df <- matrix2dataframe(m) # head(df, n=20) # browse(prgs, df, n=5)
# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files, basename=TRUE) docs <- documents(prgs) similarities(docs) # further steps # m <- similarities(docs) # df <- matrix2dataframe(m) # head(df, n=20) # browse(prgs, df, n=5)
Reads and parses files with R source code.
sourcecode(x, ...) ## Default S3 method: sourcecode(x, title = x, silent = FALSE, minlines = -1, ...)
sourcecode(x, ...) ## Default S3 method: sourcecode(x, title = x, silent = FALSE, minlines = -1, ...)
x |
character: filenames |
... |
unused |
title |
character: vector of program titles (default: x) |
silent |
logical: should the report of messages be suppressed (default: FALSE) |
minlines |
integer: only expressions with at least minlines lines are considered (default: -1) |
a sourcecode object
# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files)
# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files)
Computes the term frequency–inverse document frequency and uses the cosine of the angles between the documents as similarity measure. Since R source code is provided, no stemming or stop words are applied.
tfidf(docs)
tfidf(docs)
docs |
document object |
similarity matrix
files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names = TRUE) prgs <- sourcecode(files, basename=TRUE, silent=TRUE) docs <- documents(prgs) tfidf(docs) # further steps # m <- tfidf(docs) # df <- matrix2dataframe(m) # head(df, n=20) # browse(prgs, df, n=5)
files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names = TRUE) prgs <- sourcecode(files, basename=TRUE, silent=TRUE) docs <- documents(prgs) tfidf(docs) # further steps # m <- tfidf(docs) # df <- matrix2dataframe(m) # head(df, n=20) # browse(prgs, df, n=5)