R实现的最小二乘lsfit函数学习

1.源码

function (x, y, wt = NULL, intercept = TRUE, tolerance = 1e-,

          yname = NULL)

{

  x <- as.matrix(x)

  y <- as.matrix(y)

  xnames <- colnames(x)#x的列名

  if (is.null(xnames)) {

    if (ncol(x) == 1L) #赋予列名

      xnames <- "X"

    else xnames <- paste0("X", 1L:ncol(x))

  }

  if (intercept) {#如果有截距的话，那么就多赋值一个列。

    x <- cbind(, x)#按列合并，那么行数一样，即每行为一个样本

    xnames <- c("Intercept", xnames)

  }

  if (is.null(yname) && ncol(y) > )

    yname <- paste0("Y", 1L:ncol(y))

  good <- complete.cases(x, y, wt)#去除NA值。这里暗示了x y行数是一样的？

  dimy <- dim(as.matrix(y))

  if (any(!good)) {#如果至少有一个是false

    warning(sprintf(ngettext(sum(!good), "%d missing value deleted",

                             "%d missing values deleted"), sum(!good)), domain = NA)

    x <- as.matrix(x)[good, , drop = FALSE]

    y <- as.matrix(y)[good, , drop = FALSE]

    wt <- wt[good]

  }

  nrx <- NROW(x)

  ncx <- NCOL(x)

  nry <- NROW(y)

  ncy <- NCOL(y)

  nwts <- length(wt)

  if (nry != nrx) #如果x的样本数与y的样本数不相等

    stop(sprintf(paste0(ngettext(nrx, "'X' matrix has %d case (row)",

                                 "'X' matrix has %d cases (rows)"), ", ", ngettext(nry,

                                                                                   "'Y' has %d case (row)", "'Y' has %d cases (rows)")),

                 nrx, nry), domain = NA)

  if (nry < ncx)

    stop(sprintf(paste0(ngettext(nry, "only %d case", "only %d cases"),

                        ", ", ngettext(ncx, "but %d variable", "but %d variables")),

                 nry, ncx), domain = NA)

  if (!is.null(wt)) {#用于加权最小二乘

    if (any(wt < ))

      stop("negative weights not allowed")

    if (nwts != nry)

      stop(gettextf("number of weights = %d should equal %d (number of responses)",

                    nwts, nry), domain = NA)

    wtmult <- sqrt(wt)

    if (any(wt == )) {

      xzero <- as.matrix(x)[wt == , ]

      yzero <- as.matrix(y)[wt == , ]

    }

    x <- x * wtmult

    y <- y * wtmult

    invmult <- /ifelse(wt == , , wtmult)

  }

  z <- .Call(C_Cdqrls, x, y, tolerance, FALSE)#调用C中的函数

  resids <- array(NA, dim = dimy)

  dim(z$residuals) <- c(nry, ncy)

  if (!is.null(wt)) {

    if (any(wt == )) {

      if (ncx == 1L)

        fitted.zeros <- xzero * z$coefficients

      else fitted.zeros <- xzero %*% z$coefficients

      z$residuals[wt == , ] <- yzero - fitted.zeros

    }

    z$residuals <- z$residuals * invmult

  }

  resids[good, ] <- z$residuals#所有不含NA的行的残差被赋值为计算所得的残差

  if (dimy[2L] ==  && is.null(yname)) {#如果y只有一列

    resids <- drop(resids)#此时会变成一个向量

    names(z$coefficients) <- xnames#系数被赋值为x的列名，

    #因为一列正好对应的是一个变量，行对应的是样本数

  }

  else {

    colnames(resids) <- yname

    colnames(z$effects) <- yname

    dim(z$coefficients) <- c(ncx, ncy)#是5行，100列，正好就是系数矩阵H

    dimnames(z$coefficients) <- list(xnames, yname)#确实是对应，

    #行是细胞类型的名字，列是基因名字。

  }

  z$qr <- as.matrix(z$qr)

  colnames(z$qr) <- xnames

  output <- list(coefficients = z$coefficients, residuals = resids)

  #输出中有系数和残差

  if (z$rank != ncx) {

    xnames <- xnames[z$pivot]

    dimnames(z$qr) <- list(NULL, xnames)

    warning("'X' matrix was collinear")

  }

  if (!is.null(wt)) {

    weights <- rep.int(NA, dimy[1L])

    weights[good] <- wt

    output <- c(output, list(wt = weights))

  }

  rqr <- list(qt = drop(z$effects), qr = z$qr, qraux = z$qraux,

              rank = z$rank, pivot = z$pivot, tol = z$tol)

  oldClass(rqr) <- "qr"

  output <- c(output, list(intercept = intercept, qr = rqr))

  #这里intercept只是一个布尔值？？？

  return(output)

}

//感觉很坑啊，计算的部分是使用了

z <- .Call(C_Cdqrls, x, y, tolerance, FALSE)#调用C中的函数

> stats:::C_Cdqrls

$`name`

[] "Cdqrls"

$address

<pointer: 0x00000000025f96b0>

attr(,"class")

[] "RegisteredNativeSymbol"

$dll

DLL name: stats

Filename: D:/RRsetup/R-3.5./library/stats/libs/x64/stats.dll

Dynamic lookup: FALSE

$numParameters

[] 

attr(,"class")

[] "CallRoutine"      "NativeSymbolInfo"

这个就进行了最小二乘，具体的过程还是看不到....

//那似乎这个lsfit函数除了调用C函数计算之外，似乎也没什么了，就是去掉NA值，并且对结果进行整理，赋值列名行名之类的。

2.其中一些函数学习

2.1 paste0函数

> xy<-paste0("X",:)

> xy

[] "X1" "X2" "X3" "X4" "X5"

就是对其进行粘贴，形成一个字符串向量。

2.2 complete.cases函数——去除空值

转自：http://blog.sina.com.cn/s/blog_59990a450101qnvy.html

Value返回值为

A logical vector specifying which observations/rows have no missing values across the entire sequence.

一个逻辑向量，指明哪一行有缺失值NA

下面用实例来说明这两个函数的作用：

这是一个数据框final：

  gene hsap mmul mmus rnor cfam

1 ENSG00000208234 0 NA NA NA NA

2 ENSG00000199674 0 2 2 2 2

3 ENSG00000221622 0 NA NA NA NA

4 ENSG00000207604 0 NA NA 1 2

5 ENSG00000207431 0 NA NA NA NA

6 ENSG00000221312 0 1 2 3 2

如果要去除有NA的行，则可用：

final[complete.cases(final),]

也可用 na.omit(final)

//对行进行检测，没有NA的行返回的是true，那么自然被保存。

上述运行结果如下：

  gene hsap mmul mmus rnor cfam

2 ENSG00000199674 0 2 2 2 2

6 ENSG00000221312 0 1 2 3 2

如果想过滤部分列：

final[complete.cases(final[,:]),]

即只对5、6列进行判断。

   gene hsap mmul mmus rnor cfam

2 ENSG00000199674 0 2 2 2 2

4 ENSG00000207604 0 NA NA 1 2

6 ENSG00000221312 0 1 2 3 2

这样第四行含有空值，但是，我们的命令是只过滤掉第5列，第6列中含有NA的行。

2.3 any函数

The value returned is TRUE if at least one of the values in x is TRUE, and FALSE if all of the values in x are FALSE

//即如果有一个值为真，那么则返回真；如果全部为假，那么则返回假。

> range(x <- sort(round(stats::rnorm() - 1.2, )))

[] -2.6  0.5

> if(any(x < )) cat("x contains negative values\n")

x contains negative values
#

2.4 range函数

取向量中最大值与最小值。

> (r.x <- range(stats::rnorm()))

[] -1.805257  3.410545

#产生100个服从正态分布的数，使用range函数取最值。

2.5 rnorm函数

rnorm(n, mean = , sd = )

产生n个符合生态分布的数，缺省的均值是0，标准差是1

2.6 ngettext函数

> for(n in :)

+     print(sprintf(ngettext(n, "%d variable has missing values",

+                            "%d variables have missing values"),

+                   n))

[] "0 variables have missing values"

[] "1 variable has missing values"

[] "2 variables have missing values"

[] "3 variables have missing values"

//我认为是和C语言中的printf是比较像的，就是将其中的数进行匹配即可。

2.7 stop函数

> iter <-

> try(if(iter > ) stop("too many iterations"))

Error in try(if (iter > ) stop("too many iterations")) :

  too many iterations

#能够终止函数的执行

2.8 drop函数

对于多维数据，去掉长度为1的维度。

> dim(drop(array(:, dim = c(,,,,,,))))

[]

上例中，即去掉第一维、第三四维、第六维这四个维度，那么还剩下的是这些。

> x<-drop(array(:, dim = c(,,,,,,)))

> x

, , 

     [,] [,]

[,]

[,]

[,]        

, , 

     [,] [,]

[,]

[,]

[,]

dim是给数组赋予维数：

> z<-c(:)

> dim(z)<-c(,)

> z

     [,] [,] [,]

[,]

[,]

> z<-c(:)

> dim(z)<-c(,)

> z

     [,] [,] [,] [,] [,] [,]

[,]

> drop(z)

[]

> z<-c(:)

> dim(z)<-c(,)

> z

     [,]

[,]

[,]

[,]

[,]

[,]

[,]

> drop(z)

[]      

#试验如果维度有一个为1的话，那么会是什么情况

那么似乎是去掉那个为1的维度，然后其他正常，此时

> z<-drop(z)

> dim(z)

NULL