| Title: | R Source Code Similarity Evaluation by Variable/Function Names |
|---|---|
| Description: | Evaluates R source code by variable and/or function names. Similar source codes should deliver similarity coefficients near one. Since neither the frequency nor the order of the used names is considered, a manual inspection of the R source code is required to check for similarity. Possible use cases include detection of code clones for improving software quality and of plagiarism amongst students' assignments. |
| Authors: | Sigbert Klinke [aut, cre] |
| Maintainer: | Sigbert Klinke <[email protected]> |
| License: | GPL-3 |
| Version: | 0.2.1 |
| Built: | 2026-05-13 07:05:30 UTC |
| Source: | https://github.com/sigbertklinke/rscc |
Converts a data frame of similarity coefficients into a graph.
as_igraph(x, tol = 100 * .Machine$double.eps, tol1 = 8 * tol, ...)
x |
a similarity object |
tol |
numeric scalar >= 0. Smaller differences are not
considered, see all.equal.numeric |
tol1 |
numeric scalar >= 0. Tolerance for the symmetry pre-test on the first and last few rows, see isSymmetric |
... |
further parameters used by igraph::graph_from_adjacency_matrix |
an igraph object
files <- list.files(path=system.file("examples", package="rscc"), pattern="*.R$", full.names = TRUE) prgs <- sourcecode(files, title=basename(files)) docs <- documents(prgs) simm <- similarities(docs) # a similarity coefficients equal to zero does not create an edge! g <- as_igraph(simm, diag=FALSE) # thicker edges have higher similarity coefficients plot(g, edge.width=1+3*igraph::E(g)$weight)files <- list.files(path=system.file("examples", package="rscc"), pattern="*.R$", full.names = TRUE) prgs <- sourcecode(files, title=basename(files)) docs <- documents(prgs) simm <- similarities(docs) # a similarity coefficients equal to zero does not create an edge! g <- as_igraph(simm, diag=FALSE) # thicker edges have higher similarity coefficients plot(g, edge.width=1+3*igraph::E(g)$weight)
Creates a temporary HTML file with source codes and opens it in a browser using browseURL.
Note that the source code is reformatted.
browse(prgs, simdf, n = (simdf[, 3] > 0), width.cutoff = 60, css = NULL)
prgs |
sourcecode object |
simdf |
similarity object |
n |
integer: comparisons to show (default: simdf[, 3] > 0) |
width.cutoff |
integer: an integer in [20, 500]: if a line's character length is at or over this number, the function will try to break it into a new line (default: 60) |
css |
character: file name of CSS style for highlighting the R code |
invisibly the name of the temporary HTML file
# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files) simm <- similarities(documents(prgs)) simdf <- matrix2dataframe(simm) if (interactive()) browse(prgs, simdf)# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files) simm <- similarities(documents(prgs)) simdf <- matrix2dataframe(simm) if (interactive()) browse(prgs, simdf)
Creates word vectors from parsed source code objects. If
type=="vars" then the names of all.vars(.),
type=="funs" then the names of setdiff(all.names(.), all.vars(.)), and
type=="names" then the names of all.names(.)
are used.
documents( prgs, type = c("vars", "funs", "names"), ignore.case = TRUE, minlen = 2, ... )
prgs |
sourcecode object |
type |
character: either "vars", "funs", or "names" (default: "vars") |
ignore.case |
logical: If TRUE, case is ignored for computing (default: TRUE) |
minlen |
integer: minimal name length to be considered (default: 2) |
... |
unused |
a documents object
# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files, basename=TRUE) docs <- documents(prgs) docs# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files, basename=TRUE) docs <- documents(prgs) docs
Computes a frequency table of words and documents.
freq_table(docs, ...)
docs |
documents object |
... |
unused |
a matrix with similarities
# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files, basename=TRUE) docs <- documents(prgs) freq_table (docs)# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files, basename=TRUE) docs <- documents(prgs) freq_table (docs)
Converts a numeric matrix to a data frame with decreasing or increasing values: First column row index, second column col index and third column the value. If the matrix is symmetric, only the upper triangle is taken into account.
matrix2dataframe( m, decreasing = TRUE, tol = 100 * .Machine$double.eps, tol1 = 8 * tol, ... )
m |
numeric: a matrix of values |
decreasing |
logical: should the sort order be increasing or decreasing (default: TRUE, i.e. decreasing) |
tol |
numeric scalar >= 0. Smaller differences are not
considered, see all.equal.numeric |
tol1 |
numeric scalar >= 0. Tolerance for the symmetry pre-test on the first and last few rows, see isSymmetric |
... |
further arguments passed to methods; the matrix method
passes these to all.equal |
a data frame with an attribute matrix with m
# non-symmetric x <- matrix(runif(9), ncol=3) matrix2dataframe(x)# non-symmetric x <- matrix(runif(9), ncol=3) matrix2dataframe(x)
same_file
same_file(m, replacement = 0)
m |
matrix object with row- and columnnames |
replacement |
value for replacement (default: 0) |
matrix
m <- matrix(runif(25), ncol=5) colnames(m) <- rownames(m) <- c(sprintf("m[%.f]", 1:3), sprintf("m2[%.f]", 1:2)) m same_file(m)m <- matrix(runif(25), ncol=5) colnames(m) <- rownames(m) <- c(sprintf("m[%.f]", 1:3), sprintf("m2[%.f]", 1:2)) m same_file(m)
Internal function for faster computation. No checks on input will be performed.
sim_coeff(set1, set2, setfull, coeff)
set1 |
character: unique vector of words |
set2 |
character: unique vector of words |
setfull |
character: unique vector of texts to compare |
coeff |
character: name of similarity coefficient to use |
value of similarity coefficient
Computes a similarity coefficient based on the unique elements set1 and set2
in relation to setfull. If setfull is NULL then setfull is set
to unique(c(set1, set2)). For more details, see the vignette vignette("rscc").
similarity_coeff( set1, set2, setfull = NULL, coeff = c("jaccard", "braun", "dice", "hamann", "kappa", "kulczynski", "ochiai", "phi", "russelrao", "matching", "simpson", "sneath", "tanimoto", "yule") )
set1 |
vector: elements to compare |
set2 |
vector: elements to compare |
setfull |
vector: elements to compare (default: NULL) |
coeff |
character: coefficient to compute (default: "jaccard") |
a numeric similarity coefficient
s1 <- 1:3 s2 <- 1:5 similarity_coeff(s1, s2) s1 <- letters[1:3] s2 <- LETTERS[1:5] similarity_coeff(s1, s2)s1 <- 1:3 s2 <- 1:5 similarity_coeff(s1, s2) s1 <- letters[1:3] s2 <- LETTERS[1:5] similarity_coeff(s1, s2)
sims and similarities both calculate for each pair of source code objects
the similarity coefficients and return a data frame with the coefficients in descending order.
A larger coefficient means a greater similarity.
sims(...) similarities( docs, all = FALSE, coeff = c("jaccard", "braun", "dice", "hamann", "kappa", "kulczynski", "ochiai", "phi", "russelrao", "matching", "simpson", "sneath", "tanimoto", "yule") )
... |
all parameters in ... are passed on to similarities |
docs |
document object |
all |
logical: should the similarity coefficients be computed based on all sourcecode objects or just the two considered (default: FALSE) |
coeff |
character: coefficient to compute (default: "jaccard") |
a data frame with the results
# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files, basename=TRUE) docs <- documents(prgs) similarities(docs) # further steps # m <- similarities(docs) # df <- matrix2dataframe(m) # head(df, n=20) # browse(prgs, df, n=5)# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files, basename=TRUE) docs <- documents(prgs) similarities(docs) # further steps # m <- similarities(docs) # df <- matrix2dataframe(m) # head(df, n=20) # browse(prgs, df, n=5)
Reads and parses files with R source code.
sourcecode(x, ...) ## Default S3 method: sourcecode(x, title = x, silent = FALSE, minlines = -1, ...)
x |
character: filenames |
... |
unused |
title |
character: vector of program titles (default: x) |
silent |
logical: should the report of messages be suppressed (default: FALSE) |
minlines |
integer: only expressions with a sufficient number of lines (threshold minlines) are considered (default: -1) |
a sourcecode object
# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files)# example files are taken from https://CRAN.R-project.org/package=SimilaR files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names=TRUE) prgs <- sourcecode(files)
Computes the term frequency–inverse document frequency (tf-idf) and uses the cosine of the angles between the documents as similarity measure. Since R source code is provided, no stemming or stop words are applied.
tfidf(docs)
docs |
document object |
similarity matrix
files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names = TRUE) prgs <- sourcecode(files, basename=TRUE, silent=TRUE) docs <- documents(prgs) tfidf(docs) # further steps # m <- tfidf(docs) # df <- matrix2dataframe(m) # head(df, n=20) # browse(prgs, df, n=5)files <- list.files(system.file("examples", package="rscc"), "*.R$", full.names = TRUE) prgs <- sourcecode(files, basename=TRUE, silent=TRUE) docs <- documents(prgs) tfidf(docs) # further steps # m <- tfidf(docs) # df <- matrix2dataframe(m) # head(df, n=20) # browse(prgs, df, n=5)